# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pandas.api.types import is_datetime64_any_dtype as is_datetime
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
%matplotlib inline
DATA_DIRECTORY = 'data'
data = pd.read_csv(DATA_DIRECTORY+'/prosperLoanData.csv')
df_loan = data.copy()
df_loan.head()
| ListingKey | ListingNumber | ListingCreationDate | CreditGrade | Term | LoanStatus | ClosedDate | BorrowerAPR | BorrowerRate | LenderYield | EstimatedEffectiveYield | EstimatedLoss | EstimatedReturn | ProsperRating (numeric) | ProsperRating (Alpha) | ProsperScore | ListingCategory (numeric) | BorrowerState | Occupation | EmploymentStatus | EmploymentStatusDuration | IsBorrowerHomeowner | CurrentlyInGroup | GroupKey | DateCreditPulled | CreditScoreRangeLower | CreditScoreRangeUpper | FirstRecordedCreditLine | CurrentCreditLines | OpenCreditLines | TotalCreditLinespast7years | OpenRevolvingAccounts | OpenRevolvingMonthlyPayment | InquiriesLast6Months | TotalInquiries | CurrentDelinquencies | AmountDelinquent | DelinquenciesLast7Years | PublicRecordsLast10Years | PublicRecordsLast12Months | RevolvingCreditBalance | BankcardUtilization | AvailableBankcardCredit | TotalTrades | TradesNeverDelinquent (percentage) | TradesOpenedLast6Months | DebtToIncomeRatio | IncomeRange | IncomeVerifiable | StatedMonthlyIncome | LoanKey | TotalProsperLoans | TotalProsperPaymentsBilled | OnTimeProsperPayments | ProsperPaymentsLessThanOneMonthLate | ProsperPaymentsOneMonthPlusLate | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | ScorexChangeAtTimeOfListing | LoanCurrentDaysDelinquent | LoanFirstDefaultedCycleNumber | LoanMonthsSinceOrigination | LoanNumber | LoanOriginalAmount | LoanOriginationDate | LoanOriginationQuarter | MemberKey | MonthlyLoanPayment | LP_CustomerPayments | LP_CustomerPrincipalPayments | LP_InterestandFees | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1021339766868145413AB3B | 193129 | 2007-08-26 19:09:29.263000000 | C | 36 | Completed | 2009-08-14 00:00:00 | 0.16516 | 0.1580 | 0.1380 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | CO | Other | Self-employed | 2.0 | True | True | NaN | 2007-08-26 18:41:46.780000000 | 640.0 | 659.0 | 2001-10-11 00:00:00 | 5.0 | 4.0 | 12.0 | 1 | 24.0 | 3.0 | 3.0 | 2.0 | 472.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.00 | 1500.0 | 11.0 | 0.81 | 0.0 | 0.17 | $25,000-49,999 | True | 3083.333333 | E33A3400205839220442E84 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 78 | 19141 | 9425 | 2007-09-12 00:00:00 | Q3 2007 | 1F3E3376408759268057EDA | 330.43 | 11396.14 | 9425.00 | 1971.14 | -133.18 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 258 |
| 1 | 10273602499503308B223C1 | 1209647 | 2014-02-27 08:28:07.900000000 | NaN | 36 | Current | NaN | 0.12016 | 0.0920 | 0.0820 | 0.07960 | 0.0249 | 0.05470 | 6.0 | A | 7.0 | 2 | CO | Professional | Employed | 44.0 | False | False | NaN | 2014-02-27 08:28:14 | 680.0 | 699.0 | 1996-03-18 00:00:00 | 14.0 | 14.0 | 29.0 | 13 | 389.0 | 3.0 | 5.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 3989.0 | 0.21 | 10266.0 | 29.0 | 1.00 | 2.0 | 0.18 | $50,000-74,999 | True | 6125.000000 | 9E3B37071505919926B1D82 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 0 | 134815 | 10000 | 2014-03-03 00:00:00 | Q1 2014 | 1D13370546739025387B2F4 | 318.93 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 2 | 0EE9337825851032864889A | 81716 | 2007-01-05 15:00:47.090000000 | HR | 36 | Completed | 2009-12-17 00:00:00 | 0.28269 | 0.2750 | 0.2400 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | GA | Other | Not available | NaN | False | True | 783C3371218786870A73D20 | 2007-01-02 14:09:10.060000000 | 480.0 | 499.0 | 2002-07-27 00:00:00 | NaN | NaN | 3.0 | 0 | 0.0 | 0.0 | 1.0 | 1.0 | NaN | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.06 | Not displayed | True | 2083.333333 | 6954337960046817851BCB2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 86 | 6466 | 3001 | 2007-01-17 00:00:00 | Q1 2007 | 5F7033715035555618FA612 | 123.32 | 4186.63 | 3001.00 | 1185.63 | -24.20 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 41 |
| 3 | 0EF5356002482715299901A | 658116 | 2012-10-22 11:02:35.010000000 | NaN | 36 | Current | NaN | 0.12528 | 0.0974 | 0.0874 | 0.08490 | 0.0249 | 0.06000 | 6.0 | A | 9.0 | 16 | GA | Skilled Labor | Employed | 113.0 | True | False | NaN | 2012-10-22 11:02:32 | 800.0 | 819.0 | 1983-02-28 00:00:00 | 5.0 | 5.0 | 29.0 | 7 | 115.0 | 0.0 | 1.0 | 4.0 | 10056.0 | 14.0 | 0.0 | 0.0 | 1444.0 | 0.04 | 30754.0 | 26.0 | 0.76 | 0.0 | 0.15 | $25,000-49,999 | True | 2875.000000 | A0393664465886295619C51 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 16 | 77296 | 10000 | 2012-11-01 00:00:00 | Q4 2012 | 9ADE356069835475068C6D2 | 321.45 | 5143.20 | 4091.09 | 1052.11 | -108.01 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 158 |
| 4 | 0F023589499656230C5E3E2 | 909464 | 2013-09-14 18:38:39.097000000 | NaN | 36 | Current | NaN | 0.24614 | 0.2085 | 0.1985 | 0.18316 | 0.0925 | 0.09066 | 3.0 | D | 4.0 | 2 | MN | Executive | Employed | 44.0 | True | False | NaN | 2013-09-14 18:38:44 | 680.0 | 699.0 | 2004-02-20 00:00:00 | 19.0 | 19.0 | 49.0 | 6 | 220.0 | 1.0 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6193.0 | 0.81 | 695.0 | 39.0 | 0.95 | 2.0 | 0.26 | $100,000+ | True | 9583.333333 | A180369302188889200689E | 1.0 | 11.0 | 11.0 | 0.0 | 0.0 | 11000.0 | 9947.9 | NaN | 0 | NaN | 6 | 102670 | 15000 | 2013-09-20 00:00:00 | Q3 2013 | 36CE356043264555721F06C | 563.97 | 2819.85 | 1563.22 | 1256.63 | -60.27 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 20 |
df_loan.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingKey 113937 non-null object 1 ListingNumber 113937 non-null int64 2 ListingCreationDate 113937 non-null object 3 CreditGrade 28953 non-null object 4 Term 113937 non-null int64 5 LoanStatus 113937 non-null object 6 ClosedDate 55089 non-null object 7 BorrowerAPR 113912 non-null float64 8 BorrowerRate 113937 non-null float64 9 LenderYield 113937 non-null float64 10 EstimatedEffectiveYield 84853 non-null float64 11 EstimatedLoss 84853 non-null float64 12 EstimatedReturn 84853 non-null float64 13 ProsperRating (numeric) 84853 non-null float64 14 ProsperRating (Alpha) 84853 non-null object 15 ProsperScore 84853 non-null float64 16 ListingCategory (numeric) 113937 non-null int64 17 BorrowerState 108422 non-null object 18 Occupation 110349 non-null object 19 EmploymentStatus 111682 non-null object 20 EmploymentStatusDuration 106312 non-null float64 21 IsBorrowerHomeowner 113937 non-null bool 22 CurrentlyInGroup 113937 non-null bool 23 GroupKey 13341 non-null object 24 DateCreditPulled 113937 non-null object 25 CreditScoreRangeLower 113346 non-null float64 26 CreditScoreRangeUpper 113346 non-null float64 27 FirstRecordedCreditLine 113240 non-null object 28 CurrentCreditLines 106333 non-null float64 29 OpenCreditLines 106333 non-null float64 30 TotalCreditLinespast7years 113240 non-null float64 31 OpenRevolvingAccounts 113937 non-null int64 32 OpenRevolvingMonthlyPayment 113937 non-null float64 33 InquiriesLast6Months 113240 non-null float64 34 TotalInquiries 112778 non-null float64 35 CurrentDelinquencies 113240 non-null float64 36 AmountDelinquent 106315 non-null float64 37 DelinquenciesLast7Years 112947 non-null float64 38 PublicRecordsLast10Years 113240 non-null float64 39 PublicRecordsLast12Months 106333 non-null float64 40 RevolvingCreditBalance 106333 non-null float64 41 BankcardUtilization 106333 non-null float64 42 AvailableBankcardCredit 106393 non-null float64 43 TotalTrades 106393 non-null float64 44 TradesNeverDelinquent (percentage) 106393 non-null float64 45 TradesOpenedLast6Months 106393 non-null float64 46 DebtToIncomeRatio 105383 non-null float64 47 IncomeRange 113937 non-null object 48 IncomeVerifiable 113937 non-null bool 49 StatedMonthlyIncome 113937 non-null float64 50 LoanKey 113937 non-null object 51 TotalProsperLoans 22085 non-null float64 52 TotalProsperPaymentsBilled 22085 non-null float64 53 OnTimeProsperPayments 22085 non-null float64 54 ProsperPaymentsLessThanOneMonthLate 22085 non-null float64 55 ProsperPaymentsOneMonthPlusLate 22085 non-null float64 56 ProsperPrincipalBorrowed 22085 non-null float64 57 ProsperPrincipalOutstanding 22085 non-null float64 58 ScorexChangeAtTimeOfListing 18928 non-null float64 59 LoanCurrentDaysDelinquent 113937 non-null int64 60 LoanFirstDefaultedCycleNumber 16952 non-null float64 61 LoanMonthsSinceOrigination 113937 non-null int64 62 LoanNumber 113937 non-null int64 63 LoanOriginalAmount 113937 non-null int64 64 LoanOriginationDate 113937 non-null object 65 LoanOriginationQuarter 113937 non-null object 66 MemberKey 113937 non-null object 67 MonthlyLoanPayment 113937 non-null float64 68 LP_CustomerPayments 113937 non-null float64 69 LP_CustomerPrincipalPayments 113937 non-null float64 70 LP_InterestandFees 113937 non-null float64 71 LP_ServiceFees 113937 non-null float64 72 LP_CollectionFees 113937 non-null float64 73 LP_GrossPrincipalLoss 113937 non-null float64 74 LP_NetPrincipalLoss 113937 non-null float64 75 LP_NonPrincipalRecoverypayments 113937 non-null float64 76 PercentFunded 113937 non-null float64 77 Recommendations 113937 non-null int64 78 InvestmentFromFriendsCount 113937 non-null int64 79 InvestmentFromFriendsAmount 113937 non-null float64 80 Investors 113937 non-null int64 dtypes: bool(3), float64(50), int64(11), object(17) memory usage: 68.1+ MB
df_loan.dtypes.value_counts()
float64 50 object 17 int64 11 bool 3 dtype: int64
plt.figure(figsize=(20,10))
sns.heatmap(df_loan.isnull())
plt.show()
(df_loan.isnull().sum() *100/ len(df_loan)).sort_values(ascending=False)
GroupKey 88.290898 LoanFirstDefaultedCycleNumber 85.121602 ScorexChangeAtTimeOfListing 83.387311 ProsperPrincipalOutstanding 80.616481 ProsperPrincipalBorrowed 80.616481 ProsperPaymentsOneMonthPlusLate 80.616481 ProsperPaymentsLessThanOneMonthLate 80.616481 OnTimeProsperPayments 80.616481 TotalProsperLoans 80.616481 TotalProsperPaymentsBilled 80.616481 CreditGrade 74.588588 ClosedDate 51.649596 ProsperRating (Alpha) 25.526387 ProsperScore 25.526387 EstimatedReturn 25.526387 ProsperRating (numeric) 25.526387 EstimatedLoss 25.526387 EstimatedEffectiveYield 25.526387 DebtToIncomeRatio 7.507658 EmploymentStatusDuration 6.692295 AmountDelinquent 6.689662 CurrentCreditLines 6.673864 BankcardUtilization 6.673864 PublicRecordsLast12Months 6.673864 OpenCreditLines 6.673864 RevolvingCreditBalance 6.673864 TotalTrades 6.621203 TradesOpenedLast6Months 6.621203 TradesNeverDelinquent (percentage) 6.621203 AvailableBankcardCredit 6.621203 BorrowerState 4.840394 Occupation 3.149109 EmploymentStatus 1.979164 TotalInquiries 1.017229 DelinquenciesLast7Years 0.868901 FirstRecordedCreditLine 0.611742 PublicRecordsLast10Years 0.611742 TotalCreditLinespast7years 0.611742 CurrentDelinquencies 0.611742 InquiriesLast6Months 0.611742 CreditScoreRangeUpper 0.518708 CreditScoreRangeLower 0.518708 BorrowerAPR 0.021942 LoanOriginalAmount 0.000000 InvestmentFromFriendsAmount 0.000000 InvestmentFromFriendsCount 0.000000 Recommendations 0.000000 PercentFunded 0.000000 LP_NonPrincipalRecoverypayments 0.000000 LP_NetPrincipalLoss 0.000000 LP_CollectionFees 0.000000 LoanOriginationDate 0.000000 LP_GrossPrincipalLoss 0.000000 MemberKey 0.000000 MonthlyLoanPayment 0.000000 LP_ServiceFees 0.000000 LoanNumber 0.000000 LP_InterestandFees 0.000000 LP_CustomerPrincipalPayments 0.000000 LP_CustomerPayments 0.000000 LoanOriginationQuarter 0.000000 ListingKey 0.000000 LoanMonthsSinceOrigination 0.000000 CurrentlyInGroup 0.000000 ListingCreationDate 0.000000 Term 0.000000 LoanStatus 0.000000 BorrowerRate 0.000000 LenderYield 0.000000 ListingCategory (numeric) 0.000000 IsBorrowerHomeowner 0.000000 DateCreditPulled 0.000000 LoanCurrentDaysDelinquent 0.000000 OpenRevolvingAccounts 0.000000 OpenRevolvingMonthlyPayment 0.000000 ListingNumber 0.000000 IncomeRange 0.000000 IncomeVerifiable 0.000000 StatedMonthlyIncome 0.000000 LoanKey 0.000000 Investors 0.000000 dtype: float64
On procèdera maintenant à l'évaluation des variables target et la sélection des variables qui nous aideront dans l'étude.
TARGET_NAME = {'categ':'LoanStatus', 'num':'BorrowerAPR'}
df_loan[TARGET_NAME['categ']].unique()
array(['Completed', 'Current', 'Past Due (1-15 days)', 'Defaulted',
'Chargedoff', 'Past Due (16-30 days)', 'Cancelled',
'Past Due (61-90 days)', 'Past Due (31-60 days)',
'Past Due (91-120 days)', 'FinalPaymentInProgress',
'Past Due (>120 days)'], dtype=object)
Pour cette étude on s'interessera uniquement aux valeurs suivantes (Completed, Chargedoff, Defaulted)
df_loan_status = df_loan.loc[df_loan[TARGET_NAME['categ']].isin(["Completed", "Chargedoff", "Defaulted"])].copy()
df_loan_status.sample(4)
| ListingKey | ListingNumber | ListingCreationDate | CreditGrade | Term | LoanStatus | ClosedDate | BorrowerAPR | BorrowerRate | LenderYield | EstimatedEffectiveYield | EstimatedLoss | EstimatedReturn | ProsperRating (numeric) | ProsperRating (Alpha) | ProsperScore | ListingCategory (numeric) | BorrowerState | Occupation | EmploymentStatus | EmploymentStatusDuration | IsBorrowerHomeowner | CurrentlyInGroup | GroupKey | DateCreditPulled | CreditScoreRangeLower | CreditScoreRangeUpper | FirstRecordedCreditLine | CurrentCreditLines | OpenCreditLines | TotalCreditLinespast7years | OpenRevolvingAccounts | OpenRevolvingMonthlyPayment | InquiriesLast6Months | TotalInquiries | CurrentDelinquencies | AmountDelinquent | DelinquenciesLast7Years | PublicRecordsLast10Years | PublicRecordsLast12Months | RevolvingCreditBalance | BankcardUtilization | AvailableBankcardCredit | TotalTrades | TradesNeverDelinquent (percentage) | TradesOpenedLast6Months | DebtToIncomeRatio | IncomeRange | IncomeVerifiable | StatedMonthlyIncome | LoanKey | TotalProsperLoans | TotalProsperPaymentsBilled | OnTimeProsperPayments | ProsperPaymentsLessThanOneMonthLate | ProsperPaymentsOneMonthPlusLate | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | ScorexChangeAtTimeOfListing | LoanCurrentDaysDelinquent | LoanFirstDefaultedCycleNumber | LoanMonthsSinceOrigination | LoanNumber | LoanOriginalAmount | LoanOriginationDate | LoanOriginationQuarter | MemberKey | MonthlyLoanPayment | LP_CustomerPayments | LP_CustomerPrincipalPayments | LP_InterestandFees | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 94245 | 29733535255800571D6A451 | 547165 | 2011-12-28 08:21:56.477000000 | NaN | 36 | Completed | 2013-02-05 00:00:00 | 0.12528 | 0.0974 | 0.0874 | 0.0872 | 0.020 | 0.0672 | 6.0 | A | 8.0 | 1 | NC | Executive | Full-time | 115.0 | True | False | NaN | 2011-12-28 08:21:53 | 660.0 | 679.0 | 1994-12-14 00:00:00 | 16.0 | 15.0 | 50.0 | 9 | 1341.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 161306.0 | 0.97 | 3576.0 | 49.0 | 0.93 | 1.0 | 0.16 | $100,000+ | True | 20000.000000 | 151536399592119766EAE8C | 1.0 | 32.0 | 32.0 | 0.0 | 0.0 | 7500.0 | 0.0 | -144.0 | 0 | NaN | 26 | 58493 | 20000 | 2012-01-04 00:00:00 | Q1 2012 | 89843376489507664D14C57 | 642.91 | 21931.95 | 20000.00 | 1931.95 | -188.44 | -456.12 | 0.00 | 0.00 | 0.0 | 1.0 | 0 | 0 | 0.0 | 2 |
| 88743 | BF1A3461006353385E5AAF5 | 422116 | 2009-08-29 09:07:19.413000000 | NaN | 36 | Completed | 2011-07-11 00:00:00 | 0.14516 | 0.1238 | 0.1138 | 0.0924 | 0.021 | 0.0924 | 6.0 | A | 9.0 | 1 | IL | Other | Full-time | 15.0 | False | False | NaN | 2009-08-29 08:48:29 | 700.0 | 719.0 | 2002-08-15 00:00:00 | 13.0 | 13.0 | 17.0 | 14 | 453.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11585.0 | 0.66 | 5572.0 | 17.0 | 1.00 | 0.0 | 0.28 | $25,000-49,999 | True | 3083.333333 | C91635662960519016F1C5D | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 54 | 38500 | 5200 | 2009-09-16 00:00:00 | Q3 2009 | 76D03461287118113026425 | 173.66 | 6043.11 | 5200.01 | 843.10 | -68.09 | 0.00 | 0.00 | 0.00 | 0.0 | 1.0 | 0 | 0 | 0.0 | 145 |
| 44800 | A0C333831488041149B6981 | 102457 | 2007-02-22 09:00:23.270000000 | HR | 36 | Defaulted | 2007-09-25 00:00:00 | 0.30564 | 0.2900 | 0.2800 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | TX | Other | Full-time | 13.0 | False | True | B0473364376920128370B13 | 2007-02-16 09:14:02.660000000 | 540.0 | 559.0 | 1993-02-22 00:00:00 | 7.0 | 6.0 | 30.0 | 8 | 169.0 | 8.0 | 16.0 | 3.0 | 8166.0 | 15.0 | 1.0 | 1.0 | 2057.0 | 1.08 | 87.0 | 27.0 | 0.60 | 6.0 | 0.11 | $50,000-74,999 | True | 4166.666667 | 3359338310784822021419D | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 210 | 7.0 | 85 | 7829 | 2000 | 2007-02-27 00:00:00 | Q1 2007 | A6613381751205833E3D8D3 | 83.81 | 167.62 | 74.84 | 92.78 | -3.20 | 0.00 | 1925.16 | 1925.16 | 0.0 | 1.0 | 0 | 0 | 0.0 | 17 |
| 11109 | 8DAB3402769512501C1EF41 | 216903 | 2007-10-16 16:22:19.613000000 | D | 36 | Completed | 2009-06-26 00:00:00 | 0.15488 | 0.1400 | 0.1300 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | IA | Truck Driver | Full-time | 2.0 | False | False | NaN | 2007-09-28 16:34:18.517000000 | 620.0 | 639.0 | 2004-09-10 00:00:00 | 10.0 | 9.0 | 11.0 | 6 | 199.0 | 1.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7283.0 | 0.48 | 519.0 | 11.0 | 0.90 | 2.0 | 0.31 | $1-24,999 | True | 1250.000000 | 77E934041607006977B29F5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 77 | 21198 | 1200 | 2007-10-29 00:00:00 | Q4 2007 | 93793400081456706635C44 | 41.01 | 1411.50 | 1200.00 | 211.50 | -15.11 | 0.00 | 0.00 | 0.00 | 0.0 | 1.0 | 0 | 0 | 0.0 | 24 |
df_loan_status.shape
(55084, 81)
plt.figure(figsize=(10,5))
df_loan_status[TARGET_NAME['categ']].value_counts().plot(kind = 'bar')
plt.show()
On commencera d'abord par séparer les variables en quantitative set quantitatives:
categ_vars = df_loan_status.select_dtypes(['object', 'bool'])
numeric_vars = df_loan_status.select_dtypes(['float64', 'int64'])
categ_vars.shape[1], numeric_vars.shape[1]
(20, 61)
categ_vars.sample(4)
| ListingKey | ListingCreationDate | CreditGrade | LoanStatus | ClosedDate | ProsperRating (Alpha) | BorrowerState | Occupation | EmploymentStatus | IsBorrowerHomeowner | CurrentlyInGroup | GroupKey | DateCreditPulled | FirstRecordedCreditLine | IncomeRange | IncomeVerifiable | LoanKey | LoanOriginationDate | LoanOriginationQuarter | MemberKey | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 53370 | 5717352267949575825AF8A | 2011-08-05 21:13:17.947000000 | NaN | Completed | 2013-03-14 00:00:00 | B | IL | Executive | Employed | True | False | NaN | 2011-08-05 21:13:13 | 1992-04-30 00:00:00 | $100,000+ | True | 74A536290442965308F722A | 2011-08-31 00:00:00 | Q3 2011 | 216A352294062256332E80D |
| 51550 | 6CD93380423764442AEDF27 | 2007-01-29 19:25:42.753000000 | C | Defaulted | 2008-06-08 00:00:00 | NaN | NaN | Other | Not available | True | False | NaN | 2007-01-29 19:16:37.260000000 | 1988-02-01 00:00:00 | Not displayed | True | 71F43380854142938A4CB6C | 2007-02-08 00:00:00 | Q1 2007 | 336B3381075214414768CE4 |
| 11238 | 5D543556396000598F8E66D | 2012-08-21 18:57:44.933000000 | NaN | Completed | 2012-12-13 00:00:00 | C | IN | Executive | Employed | True | False | NaN | 2012-08-21 18:57:40 | 1994-08-06 00:00:00 | $100,000+ | True | D206365913921309847F134 | 2012-08-28 00:00:00 | Q3 2012 | 25F03556202519089DBA7F4 |
| 58477 | 196535304722270564220F7 | 2011-11-09 08:04:44.707000000 | NaN | Completed | 2013-01-30 00:00:00 | B | NV | Professional | Full-time | True | False | NaN | 2011-11-09 08:04:42 | 1994-08-01 00:00:00 | $75,000-99,999 | True | 10B036352003963353536BE | 2011-11-15 00:00:00 | Q4 2011 | 57B53428129271396F60EA5 |
numeric_vars.sample(4)
| ListingNumber | Term | BorrowerAPR | BorrowerRate | LenderYield | EstimatedEffectiveYield | EstimatedLoss | EstimatedReturn | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | CurrentCreditLines | OpenCreditLines | TotalCreditLinespast7years | OpenRevolvingAccounts | OpenRevolvingMonthlyPayment | InquiriesLast6Months | TotalInquiries | CurrentDelinquencies | AmountDelinquent | DelinquenciesLast7Years | PublicRecordsLast10Years | PublicRecordsLast12Months | RevolvingCreditBalance | BankcardUtilization | AvailableBankcardCredit | TotalTrades | TradesNeverDelinquent (percentage) | TradesOpenedLast6Months | DebtToIncomeRatio | StatedMonthlyIncome | TotalProsperLoans | TotalProsperPaymentsBilled | OnTimeProsperPayments | ProsperPaymentsLessThanOneMonthLate | ProsperPaymentsOneMonthPlusLate | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | ScorexChangeAtTimeOfListing | LoanCurrentDaysDelinquent | LoanFirstDefaultedCycleNumber | LoanMonthsSinceOrigination | LoanNumber | LoanOriginalAmount | MonthlyLoanPayment | LP_CustomerPayments | LP_CustomerPrincipalPayments | LP_InterestandFees | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12517 | 353775 | 36 | 0.12866 | 0.1075 | 0.0975 | NaN | NaN | NaN | NaN | NaN | 1 | 82.0 | 660.0 | 679.0 | 9.0 | 6.0 | 14.0 | 6 | 38.0 | 1.0 | 3.0 | 0.0 | 0.0 | 8.0 | 0.0 | 0.0 | 881.0 | 0.18 | 2861.0 | 13.0 | 0.53 | 1.0 | 0.02 | 3333.333333 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1590 | 21.0 | 68 | 33573 | 1000 | 32.62 | 474.30 | 346.77 | 127.53 | -10.47 | 0.0 | 653.23 | 653.23 | 0.0 | 1.0 | 0 | 0 | 0.0 | 18 |
| 20348 | 315560 | 36 | 0.10843 | 0.1015 | 0.0915 | NaN | NaN | NaN | NaN | NaN | 3 | 20.0 | 800.0 | 819.0 | 9.0 | 9.0 | 19.0 | 4 | 78.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1784.0 | 0.28 | 4127.0 | 18.0 | 1.00 | 0.0 | 0.12 | 12500.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 71 | 30302 | 10000 | 323.38 | 11364.86 | 10000.00 | 1364.86 | -134.47 | 0.0 | 0.00 | 0.00 | 0.0 | 1.0 | 0 | 0 | 0.0 | 306 |
| 9409 | 587020 | 36 | 0.33973 | 0.2999 | 0.2899 | 0.2766 | 0.1490 | 0.1276 | 2.0 | 1.0 | 1 | 8.0 | 640.0 | 659.0 | 13.0 | 9.0 | 53.0 | 5 | 149.0 | 3.0 | 7.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4207.0 | 0.89 | 295.0 | 44.0 | 0.91 | 2.0 | 0.27 | 6333.333333 | 1.0 | 9.0 | 8.0 | 1.0 | 0.0 | 2500.0 | 2080.03 | -60.0 | 0 | NaN | 22 | 65300 | 4000 | 169.78 | 4485.75 | 4000.00 | 485.75 | -16.20 | 0.0 | 0.00 | 0.00 | 0.0 | 1.0 | 0 | 0 | 0.0 | 37 |
| 112822 | 733314 | 36 | 0.30285 | 0.2639 | 0.2539 | 0.2318 | 0.1225 | 0.1093 | 2.0 | 2.0 | 18 | 87.0 | 700.0 | 719.0 | 21.0 | 15.0 | 53.0 | 10 | 2043.0 | 1.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 72928.0 | 0.86 | 9214.0 | 53.0 | 1.00 | 2.0 | 0.42 | 7916.666667 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 12 | 87166 | 2500 | 101.25 | 2530.73 | 2500.00 | 30.73 | -1.16 | 0.0 | 0.00 | 0.00 | 0.0 | 1.0 | 0 | 0 | 0.0 | 33 |
On remarque que parmis les variables catégoriques on a des variables de type date qui devraient être convertis:
# convertir les dates
date_vars = ['ListingCreationDate', 'ClosedDate', 'DateCreditPulled', 'FirstRecordedCreditLine', 'LoanOriginationDate']
categ_vars.loc[:,date_vars] = categ_vars[date_vars].apply(pd.to_datetime)
# test
for col in date_vars:
assert is_datetime(categ_vars[col])
parmi les variables de type date on s'aperçcoit que quelques une ne nous serviront pas dans l'étude donc on les supprime:
# supprimer les colonnes inutiles
cols_to_drop = ['ClosedDate', 'DateCreditPulled', 'FirstRecordedCreditLine', 'ListingCreationDate']
categ_vars.drop(cols_to_drop, axis = 1, inplace = True)
# Test
for col in cols_to_drop:
assert not col in categ_vars
On remarque aussi que parmi les variables catégoriques on a des colonnes qui représentent des clés qui permettent d'identifier les listings, les crédits et les membres, on utilisera ces clés pour vérifier les doublons puis on les supprimera.
Autre colonnes à supprimer : 'CurrentlyInGroup','ProsperRating (Alpha)'
# vérifier les duplications par rapport à ListingKey
df_loan_status.ListingKey.duplicated().sum()
13
cols_to_drop = ['LoanKey', 'ListingKey', 'GroupKey', 'CurrentlyInGroup','ProsperRating (Alpha)',
'BorrowerState', 'LoanOriginationQuarter', 'IsBorrowerHomeowner', 'IncomeVerifiable']
categ_vars.drop(cols_to_drop, axis = 1, inplace =True)
# test
for col in cols_to_drop:
assert not col in categ_vars
categ_vars.sample(4)
| CreditGrade | LoanStatus | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | |
|---|---|---|---|---|---|---|---|
| 4744 | NaN | Completed | Professional | Employed | $25,000-49,999 | 2012-03-15 | 61AC35420399188547AAED1 |
| 70971 | D | Completed | Clerical | Not available | Not displayed | 2006-08-31 | B0D73365912571773F5E7DB |
| 83462 | C | Completed | Construction | Self-employed | $100,000+ | 2007-02-26 | CE513380337453493147B01 |
| 962 | A | Defaulted | Sales - Commission | Self-employed | $100,000+ | 2008-09-26 | 38E233647635464480C33A5 |
numeric_vars.sample(5)
| ListingNumber | Term | BorrowerAPR | BorrowerRate | LenderYield | EstimatedEffectiveYield | EstimatedLoss | EstimatedReturn | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | CurrentCreditLines | OpenCreditLines | TotalCreditLinespast7years | OpenRevolvingAccounts | OpenRevolvingMonthlyPayment | InquiriesLast6Months | TotalInquiries | CurrentDelinquencies | AmountDelinquent | DelinquenciesLast7Years | PublicRecordsLast10Years | PublicRecordsLast12Months | RevolvingCreditBalance | BankcardUtilization | AvailableBankcardCredit | TotalTrades | TradesNeverDelinquent (percentage) | TradesOpenedLast6Months | DebtToIncomeRatio | StatedMonthlyIncome | TotalProsperLoans | TotalProsperPaymentsBilled | OnTimeProsperPayments | ProsperPaymentsLessThanOneMonthLate | ProsperPaymentsOneMonthPlusLate | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | ScorexChangeAtTimeOfListing | LoanCurrentDaysDelinquent | LoanFirstDefaultedCycleNumber | LoanMonthsSinceOrigination | LoanNumber | LoanOriginalAmount | MonthlyLoanPayment | LP_CustomerPayments | LP_CustomerPrincipalPayments | LP_InterestandFees | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3369 | 174500 | 36 | 0.29776 | 0.2900 | 0.2800 | NaN | NaN | NaN | NaN | NaN | 0 | 0.0 | 600.0 | 619.0 | 7.0 | 5.0 | 26.0 | 3 | 55.0 | 2.0 | 9.0 | 3.0 | 10528.0 | 16.0 | 0.0 | 0.0 | 1612.0 | 0.23 | 4645.0 | 23.0 | 0.56 | 0.0 | 0.15000 | 7500.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1743 | 26.0 | 79 | 17559 | 10000 | 419.06 | 8800.26 | 4771.11 | 4029.15 | -138.94 | 0.0 | 5228.89 | 5228.89 | 0.0 | 1.0 | 1 | 0 | 0.0 | 106 |
| 99963 | 66371 | 36 | 0.16802 | 0.1500 | 0.1450 | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 560.0 | 579.0 | NaN | NaN | 37.0 | 3 | 57.0 | 7.0 | 19.0 | 7.0 | NaN | 18.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.09000 | 3250.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1884 | 30.0 | 87 | 5346 | 1000 | 34.67 | 832.08 | 615.84 | 216.24 | -7.20 | 0.0 | 384.16 | 384.16 | 0.0 | 1.0 | 0 | 0 | 0.0 | 43 |
| 101244 | 10282 | 36 | 0.22241 | 0.2150 | 0.2100 | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 620.0 | 639.0 | NaN | NaN | 9.0 | 0 | 0.0 | 4.0 | NaN | 2.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.05952 | 2500.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 94 | 816 | 3000 | 113.80 | 4044.73 | 3000.00 | 1044.73 | -24.30 | 0.0 | 0.00 | 0.00 | 0.0 | 1.0 | 0 | 0 | 0.0 | 29 |
| 19572 | 627057 | 36 | 0.35797 | 0.3177 | 0.3077 | 0.2896 | 0.1675 | 0.1221 | 1.0 | 4.0 | 1 | 25.0 | 660.0 | 679.0 | 2.0 | 2.0 | 5.0 | 1 | 30.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 44.0 | 0.14 | 256.0 | 5.0 | 1.00 | 1.0 | 1.12000 | 208.333333 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 286 | 13.0 | 19 | 73228 | 4000 | 173.71 | 1374.68 | 547.75 | 826.93 | -19.80 | 0.0 | 3452.25 | 3452.25 | 0.0 | 1.0 | 0 | 0 | 0.0 | 11 |
| 1228 | 429669 | 36 | 0.10846 | 0.1050 | 0.0950 | 0.0799 | 0.0150 | 0.0799 | 7.0 | 7.0 | 7 | 1.0 | 800.0 | 819.0 | 6.0 | 6.0 | 10.0 | 5 | 91.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2667.0 | 0.14 | 11983.0 | 9.0 | 1.00 | 0.0 | 0.09000 | 5416.666667 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 52 | 39305 | 12500 | 406.28 | 12857.43 | 12500.00 | 357.43 | -34.04 | 0.0 | 0.00 | 0.00 | 0.0 | 1.0 | 0 | 1 | 75.0 | 465 |
# colonnes à supprimer
cols_to_drop = [
'ListingNumber',
'BorrowerRate',
'LenderYield',
'DelinquenciesLast7Years',
'OpenRevolvingMonthlyPayment',
'RevolvingCreditBalance',
'EstimatedEffectiveYield',
'EstimatedLoss',
'Recommendations',
'EstimatedReturn',
'CurrentCreditLines',
'OpenCreditLines',
'TotalCreditLinespast7years',
'OpenRevolvingAccounts',
'InquiriesLast6Months',
'TotalInquiries',
'BankcardUtilization',
'CurrentDelinquencies',
'AmountDelinquent',
'PublicRecordsLast10Years',
'PublicRecordsLast12Months',
'TotalTrades',
'TradesOpenedLast6Months',
'AvailableBankcardCredit',
'OnTimeProsperPayments',
'ProsperPaymentsLessThanOneMonthLate',
'ProsperPaymentsOneMonthPlusLate',
'ScorexChangeAtTimeOfListing',
'LoanFirstDefaultedCycleNumber',
'LoanMonthsSinceOrigination',
'LoanNumber',
'LP_CustomerPayments',
'LP_CustomerPrincipalPayments',
'LP_InterestandFees',
'LP_ServiceFees',
'LP_CollectionFees',
'LP_GrossPrincipalLoss',
'LP_NetPrincipalLoss',
'LP_NonPrincipalRecoverypayments',
'PercentFunded',
'InvestmentFromFriendsCount',
'InvestmentFromFriendsAmount',
'Investors',
'TotalProsperLoans',
'TotalProsperPaymentsBilled',
'LoanCurrentDaysDelinquent'
]
numeric_vars.drop(cols_to_drop, axis=1, inplace=True)
# test
for col in cols_to_drop:
assert not col in numeric_vars
numeric_vars.sample(4)
| Term | BorrowerAPR | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 719 | 36 | 0.13905 | NaN | NaN | 0 | 28.0 | 640.0 | 659.0 | 0.75 | 0.16 | 3750.000000 | NaN | NaN | 6001 | 202.78 |
| 54831 | 36 | 0.29776 | NaN | NaN | 0 | NaN | 560.0 | 579.0 | NaN | 0.12 | 4583.333333 | NaN | NaN | 3000 | 125.72 |
| 102384 | 36 | 0.07439 | 7.0 | 10.0 | 1 | 19.0 | 740.0 | 759.0 | 0.88 | 0.07 | 8000.000000 | NaN | NaN | 2000 | 61.85 |
| 58419 | 36 | 0.21290 | 4.0 | 6.0 | 1 | 157.0 | 660.0 | 679.0 | 0.84 | 0.20 | 6666.666667 | NaN | NaN | 20000 | 719.04 |
# concatener categ_vars et numeric_vars
df_loan_status = pd.concat([categ_vars, numeric_vars], axis = 1)
# place target variable at the end of the dataframe
new_cols = df_loan_status.columns.drop(TARGET_NAME.values()).insert(df_loan_status.shape[1]-2,
TARGET_NAME['categ']).insert(df_loan_status.shape[1]-1, TARGET_NAME['num'])
df_loan_status = df_loan_status[new_cols]
df_loan_status.sample(4)
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 95989 | NaN | Professional | Employed | $75,000-99,999 | 2012-06-08 | 146C35353362647170571D9 | 60 | 4.0 | 7.0 | 1 | 4.0 | 660.0 | 679.0 | 1.00 | 0.15 | 6250.000000 | NaN | NaN | 4000 | 117.15 | Chargedoff | 0.27462 |
| 34730 | E | Professional | Full-time | $50,000-74,999 | 2007-07-05 | 26F233681011994197917C5 | 36 | NaN | NaN | 0 | 77.0 | 580.0 | 599.0 | 0.69 | 0.22 | 5964.833333 | NaN | NaN | 7650 | 280.42 | Completed | 0.20471 |
| 84883 | NaN | Other | Retired | $50,000-74,999 | 2009-11-06 | 89F63460549109128C5DEF0 | 36 | 7.0 | 7.0 | 7 | 85.0 | 820.0 | 839.0 | 1.00 | 0.34 | 5705.583333 | NaN | NaN | 15000 | 494.64 | Completed | 0.11848 |
| 10839 | NaN | Executive | Employed | $50,000-74,999 | 2011-11-15 | E7C7341663845386183DDF9 | 12 | 5.0 | 7.0 | 3 | 52.0 | 720.0 | 739.0 | 0.92 | 0.13 | 5833.333333 | 6500.0 | 0.0 | 3000 | 232.89 | Completed | 0.18899 |
df_loan_status
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C | Other | Self-employed | $25,000-49,999 | 2007-09-12 | 1F3E3376408759268057EDA | 36 | NaN | NaN | 0 | 2.0 | 640.0 | 659.0 | 0.81 | 0.17 | 3083.333333 | NaN | NaN | 9425 | 330.43 | Completed | 0.16516 |
| 2 | HR | Other | Not available | Not displayed | 2007-01-17 | 5F7033715035555618FA612 | 36 | NaN | NaN | 0 | NaN | 480.0 | 499.0 | NaN | 0.06 | 2083.333333 | NaN | NaN | 3001 | 123.32 | Completed | 0.28269 |
| 11 | C | Waiter/Waitress | Full-time | $1-24,999 | 2007-10-18 | 59423376541417552B94BD3 | 36 | NaN | NaN | 0 | 19.0 | 640.0 | 659.0 | 0.40 | 0.27 | 1666.666667 | NaN | NaN | 1000 | 33.81 | Completed | 0.15033 |
| 15 | NaN | Other | Other | $50,000-74,999 | 2012-04-19 | 924335448287456229847ED | 36 | 1.0 | 5.0 | 13 | 121.0 | 700.0 | 719.0 | 0.94 | 0.49 | 5500.000000 | NaN | NaN | 4000 | 173.71 | Defaulted | 0.35797 |
| 17 | AA | Professional | Not available | Not displayed | 2006-08-22 | 893E3365582060699EBBADA | 36 | NaN | NaN | 0 | NaN | 760.0 | 779.0 | NaN | 0.12 | 5833.333333 | NaN | NaN | 10000 | 334.54 | Chargedoff | 0.13202 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113926 | E | Other | Not available | Not displayed | 2006-08-08 | 24433364525134628C58A2F | 36 | NaN | NaN | 0 | NaN | 540.0 | 559.0 | NaN | 0.07 | 5416.666667 | NaN | NaN | 3000 | 125.72 | Defaulted | 0.29776 |
| 113927 | AA | Executive | Full-time | $100,000+ | 2008-05-09 | 5BAF33803622558831EA3AC | 36 | NaN | NaN | 4 | 69.0 | 760.0 | 779.0 | 1.00 | 0.06 | 10333.333333 | 1000.0 | 847.61 | 4292 | 132.11 | Completed | 0.07469 |
| 113928 | NaN | Other | Full-time | $25,000-49,999 | 2011-06-10 | 3A2F3380477699707C81385 | 36 | 4.0 | 6.0 | 3 | 22.0 | 740.0 | 759.0 | 1.00 | 0.27 | 2333.333333 | 14250.0 | 0.02 | 2000 | 73.30 | Completed | 0.22362 |
| 113929 | NaN | Accountant/CPA | Employed | $50,000-74,999 | 2013-07-10 | CBD43555702871369F462DA | 36 | 2.0 | 3.0 | 2 | 25.0 | 660.0 | 679.0 | 0.75 | 0.05 | 4333.333333 | NaN | NaN | 2500 | 101.25 | Completed | 0.30285 |
| 113935 | NaN | Food Service | Full-time | $25,000-49,999 | 2011-11-21 | A33834861822272782621C8 | 60 | 4.0 | 5.0 | 2 | 94.0 | 680.0 | 699.0 | 0.80 | 0.48 | 3875.000000 | 5000.0 | 3264.37 | 15000 | 449.55 | Completed | 0.28408 |
55084 rows × 22 columns
df_loan_status.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 55084 entries, 0 to 113935 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CreditGrade 28948 non-null object 1 Occupation 52819 non-null object 2 EmploymentStatus 52832 non-null object 3 IncomeRange 55084 non-null object 4 LoanOriginationDate 55084 non-null datetime64[ns] 5 MemberKey 55084 non-null object 6 Term 55084 non-null int64 7 ProsperRating (numeric) 26005 non-null float64 8 ProsperScore 26005 non-null float64 9 ListingCategory (numeric) 55084 non-null int64 10 EmploymentStatusDuration 47473 non-null float64 11 CreditScoreRangeLower 54494 non-null float64 12 CreditScoreRangeUpper 54494 non-null float64 13 TradesNeverDelinquent (percentage) 47544 non-null float64 14 DebtToIncomeRatio 50854 non-null float64 15 StatedMonthlyIncome 55084 non-null float64 16 ProsperPrincipalBorrowed 10539 non-null float64 17 ProsperPrincipalOutstanding 10539 non-null float64 18 LoanOriginalAmount 55084 non-null int64 19 MonthlyLoanPayment 55084 non-null float64 20 LoanStatus 55084 non-null object 21 BorrowerAPR 55059 non-null float64 dtypes: datetime64[ns](1), float64(12), int64(3), object(6) memory usage: 9.7+ MB
# vérification du pourcentage des valaurs manquantes
(df_loan_status.isnull().sum() / len(df_loan_status)).sort_values(ascending=False)
ProsperPrincipalOutstanding 0.808674 ProsperPrincipalBorrowed 0.808674 ProsperRating (numeric) 0.527903 ProsperScore 0.527903 CreditGrade 0.474475 EmploymentStatusDuration 0.138171 TradesNeverDelinquent (percentage) 0.136882 DebtToIncomeRatio 0.076792 Occupation 0.041119 EmploymentStatus 0.040883 CreditScoreRangeUpper 0.010711 CreditScoreRangeLower 0.010711 BorrowerAPR 0.000454 ListingCategory (numeric) 0.000000 Term 0.000000 MemberKey 0.000000 StatedMonthlyIncome 0.000000 LoanOriginationDate 0.000000 IncomeRange 0.000000 LoanOriginalAmount 0.000000 MonthlyLoanPayment 0.000000 LoanStatus 0.000000 dtype: float64
# affichage des variables catégorique avec leurs différentes valeurs unique
for col in df_loan_status.select_dtypes('object'):
print(f'{col:-<50} {df_loan_status[col].unique()}')
CreditGrade--------------------------------------- ['C' 'HR' nan 'AA' 'D' 'B' 'E' 'A' 'NC'] Occupation---------------------------------------- ['Other' 'Waiter/Waitress' 'Professional' 'Skilled Labor' 'Sales - Commission' 'Executive' nan 'Accountant/CPA' 'Construction' 'Analyst' "Nurse's Aide" 'Fireman' 'Realtor' 'Clerical' 'Laborer' 'Food Service Management' 'Truck Driver' 'Administrative Assistant' 'Police Officer/Correction Officer' 'Nurse (RN)' 'Social Worker' 'Computer Programmer' 'Military Officer' 'Sales - Retail' 'Military Enlisted' 'Food Service' 'Tradesman - Mechanic' 'Postal Service' 'Teacher' 'Pharmacist' 'Retail Management' 'Engineer - Mechanical' 'Dentist' 'Architect' 'Landscaping' 'Nurse (LPN)' 'Tradesman - Carpenter' 'Medical Technician' 'Tradesman - Plumber' 'Tradesman - Electrician' 'Bus Driver' 'Engineer - Chemical' 'Student - College Senior' 'Principal' 'Attorney' 'Scientist' 'Doctor' 'Pilot - Private/Commercial' 'Engineer - Electrical' 'Homemaker' 'Student - College Graduate Student' 'Civil Service' 'Student - Technical School' 'Psychologist' 'Biologist' 'Religious' 'Professor' 'Chemist' 'Student - College Sophomore' 'Clergy' 'Investor' 'Student - College Junior' 'Flight Attendant' 'Car Dealer' "Teacher's Aide" 'Student - Community College' 'Student - College Freshman' 'Judge'] EmploymentStatus---------------------------------- ['Self-employed' 'Not available' 'Full-time' 'Other' 'Employed' nan 'Not employed' 'Part-time' 'Retired'] IncomeRange--------------------------------------- ['$25,000-49,999' 'Not displayed' '$1-24,999' '$50,000-74,999' '$75,000-99,999' '$100,000+' 'Not employed' '$0'] MemberKey----------------------------------------- ['1F3E3376408759268057EDA' '5F7033715035555618FA612' '59423376541417552B94BD3' ... '2CC43431047896407BFDB84' '24433364525134628C58A2F' 'CBD43555702871369F462DA'] LoanStatus---------------------------------------- ['Completed' 'Defaulted' 'Chargedoff']
Pour CreditGrade la valeur NC ne figure pas parmis l'ensemble des valeurs definit au niveau du site web de propser.
Vérifions maintenant les lignes où IncomeRange = 0$:
df_loan_status.query('IncomeRange == "$0"').EmploymentStatus.value_counts(normalize = True).plot.bar()
<AxesSubplot:>
Presque 50% de ces cas ont des jobs full-time, donc pour les cas (full-time, part-time et retired) on replacera cette valeur par Not displayed.
Vérifions maintenant si les données entre occupation et statut d'emploiement sont cohérentes:
df_loan_status.query("EmploymentStatus == 'Not employed' & StatedMonthlyIncome != 0").Occupation.unique()
array(['Other', 'Student - College Graduate Student',
'Sales - Commission', 'Student - Community College',
'Psychologist', 'Student - College Senior',
'Student - College Junior', 'Professional',
'Student - College Sophomore', 'Analyst', "Teacher's Aide",
'Retail Management', 'Homemaker', 'Sales - Retail', "Nurse's Aide",
'Waiter/Waitress', 'Student - Technical School',
'Student - College Freshman', 'Skilled Labor'], dtype=object)
mask = (df_loan_status.EmploymentStatus == 'Not employed') & (~df_loan_status.Occupation.str.contains('Student|Other', regex=True, na=False))
df_loan_status.loc[mask, ["StatedMonthlyIncome", "IncomeRange"]]
| StatedMonthlyIncome | IncomeRange | |
|---|---|---|
| 3494 | 0.083333 | Not employed |
| 12976 | 0.083333 | Not employed |
| 23186 | 1820.000000 | Not employed |
| 30657 | 3333.333333 | Not employed |
| 46168 | 0.083333 | Not employed |
| 48124 | 0.083333 | Not displayed |
| 51391 | 0.000000 | Not employed |
| 51591 | 0.083333 | Not employed |
| 52485 | 0.083333 | Not employed |
| 59043 | 0.083333 | Not employed |
| 59790 | 500.000000 | Not employed |
| 63607 | 5833.333333 | Not employed |
| 79151 | 738.166667 | Not employed |
| 82145 | 20833.333333 | Not employed |
| 82933 | 1500.000000 | Not employed |
| 83221 | 20833.333333 | $100,000+ |
| 83675 | 0.000000 | Not employed |
| 99010 | 0.083333 | Not employed |
| 99066 | 6666.666667 | Not employed |
| 105503 | 1916.666667 | Not employed |
| 109111 | 0.083333 | Not employed |
Pour les occupation différentes de (student et other) où le statut d'emploiement est Not employed on mettra plutot Not available, il va falloir aussi corriger la valeur de IncomeRange. Vérifions maitenant le cas inverse :
mask = (df_loan_status.EmploymentStatus == 'Not employed') & (df_loan_status.Occupation.str.contains('Student|Other', regex=True, na=False)) & (df_loan_status.IncomeRange == "Not employed")
print(f'Income Range : {df_loan_status.loc[mask, "IncomeRange"].unique()}')
month_income = df_loan_status.loc[mask, "StatedMonthlyIncome"]
month_income[month_income > 0]
Income Range : ['Not employed']
66 860.000000 1242 0.083333 1925 0.083333 2394 0.083333 3821 1211.666667 4683 623.000000 4791 2926.000000 9564 833.333333 9911 1000.000000 10949 583.333333 13647 1505.000000 14599 175.000000 18423 3500.000000 19622 0.083333 20103 3633.000000 20499 833.333333 21386 2167.583333 22905 125.250000 25650 856.000000 26669 1360.000000 28185 828.000000 28949 37.500000 37625 0.083333 37896 583.333333 38889 2775.916667 42332 4126.416667 46467 56.250000 49435 0.083333 49741 0.083333 54517 0.083333 54605 868.000000 57163 2166.666667 60735 900.000000 67076 2025.000000 76229 1014.000000 79698 643.000000 80471 2348.000000 82130 0.083333 85716 0.083333 86265 1400.000000 87620 1250.000000 88171 393.416667 88504 839.666667 88628 964.333333 90975 0.083333 92119 4000.000000 92622 57.583333 92666 480.000000 93230 1507.000000 96181 430.000000 98204 669.000000 104014 875.000000 104483 0.083333 105687 9096.000000 105951 1173.000000 106081 3000.000000 107228 4166.666667 107590 0.083333 109052 1409.250000 109600 1000.000000 110804 0.083333 112778 2972.000000 113692 1289.000000 Name: StatedMonthlyIncome, dtype: float64
Il va falloir aussi corriger les valeurs de IncomeRange, statut d'emploiement et StatedMonthlyIncome des occupations (Student et Other) pour les lignes où le salaire mensuel est différent de 0.
df_loan_status.describe()
| Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 55084.000000 | 26005.000000 | 26005.000000 | 55084.000000 | 47473.000000 | 54494.000000 | 54494.000000 | 47544.000000 | 50854.000000 | 55084.000000 | 10539.000000 | 10539.000000 | 55084.000000 | 55084.000000 | 55059.000000 |
| mean | 36.942851 | 3.662911 | 6.265987 | 2.407578 | 80.895878 | 671.746247 | 690.746247 | 0.856399 | 0.289866 | 5054.723018 | 7104.678315 | 2332.674796 | 6261.892746 | 223.729882 | 0.222194 |
| std | 7.333480 | 1.814450 | 2.319612 | 3.397823 | 84.669661 | 81.908137 | 81.908137 | 0.174057 | 0.736738 | 5711.051112 | 6395.297342 | 3285.130455 | 5217.758039 | 185.727523 | 0.088177 |
| min | 12.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 19.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1000.000000 | 0.000000 | 0.006530 |
| 25% | 36.000000 | 2.000000 | 5.000000 | 0.000000 | 21.000000 | 640.000000 | 659.000000 | 0.770000 | 0.130000 | 2810.604167 | 3000.000000 | 0.000000 | 2600.000000 | 98.290000 | 0.149740 |
| 50% | 36.000000 | 3.000000 | 6.000000 | 1.000000 | 52.000000 | 680.000000 | 699.000000 | 0.920000 | 0.200000 | 4166.666667 | 5000.000000 | 1098.020000 | 4500.000000 | 172.600000 | 0.214340 |
| 75% | 36.000000 | 5.000000 | 8.000000 | 3.000000 | 112.000000 | 720.000000 | 739.000000 | 1.000000 | 0.300000 | 6250.000000 | 9500.000000 | 3382.560000 | 8000.000000 | 300.472500 | 0.295100 |
| max | 60.000000 | 7.000000 | 11.000000 | 20.000000 | 755.000000 | 880.000000 | 899.000000 | 1.000000 | 10.010000 | 618547.833333 | 60001.000000 | 22586.670000 | 35000.000000 | 2251.510000 | 0.512290 |
Le min de MonthlyLoanPayment est 0 ce qui semble un peu bizarre
df_loan_status.query('MonthlyLoanPayment == 0')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | NaN | Executive | Full-time | $75,000-99,999 | 2010-06-24 | B39A347662239828607EAC0 | 36 | 6.0 | 9.0 | 1 | 10.0 | 700.0 | 719.0 | 1.00 | 0.26 | 6666.666667 | NaN | NaN | 4000 | 0.0 | Completed | 0.11296 |
| 66 | D | Other | Not employed | Not employed | 2007-04-17 | 344C3384310377315F39A56 | 36 | NaN | NaN | 0 | 1.0 | 620.0 | 639.0 | 0.53 | 0.20 | 860.000000 | NaN | NaN | 1000 | 0.0 | Completed | 0.16802 |
| 113 | HR | Other | Self-employed | $50,000-74,999 | 2007-02-28 | 8A8C3382615470032545608 | 36 | NaN | NaN | 0 | 37.0 | 520.0 | 539.0 | 0.06 | 0.04 | 4166.666667 | NaN | NaN | 3000 | 0.0 | Completed | 0.30564 |
| 198 | NaN | Professional | Employed | $100,000+ | 2010-08-09 | 5709339069157083673EE2F | 36 | 3.0 | 8.0 | 7 | 3.0 | 660.0 | 679.0 | 1.00 | 0.12 | 10000.000000 | 3000.0 | 0.00 | 3000 | 0.0 | Completed | 0.29265 |
| 310 | D | NaN | NaN | Not displayed | 2006-07-03 | 12A73366266037057B67D88 | 36 | NaN | NaN | 0 | NaN | 600.0 | 619.0 | NaN | 0.16 | 3750.000000 | NaN | NaN | 1800 | 0.0 | Completed | 0.12724 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113588 | D | Student - College Graduate Student | Part-time | $1-24,999 | 2007-06-26 | A79E3382252261518835F53 | 36 | NaN | NaN | 0 | 2.0 | 600.0 | 619.0 | 0.43 | 0.50 | 800.000000 | NaN | NaN | 1258 | 0.0 | Completed | 0.19957 |
| 113614 | HR | Professional | Full-time | $25,000-49,999 | 2008-03-05 | 93833407310815443F14F22 | 36 | NaN | NaN | 1 | 2.0 | 520.0 | 539.0 | 0.38 | 0.21 | 4000.000000 | NaN | NaN | 1500 | 0.0 | Defaulted | 0.37453 |
| 113717 | NaN | Professor | Employed | $50,000-74,999 | 2010-10-28 | 570633900248261228954D4 | 36 | 6.0 | 9.0 | 1 | 2.0 | 760.0 | 779.0 | 1.00 | 0.18 | 5072.833333 | 13000.0 | 0.01 | 4900 | 0.0 | Completed | 0.12663 |
| 113840 | NaN | Teacher's Aide | Full-time | $1-24,999 | 2009-12-14 | A29433928941492074677BF | 36 | 4.0 | 10.0 | 7 | 21.0 | 660.0 | 679.0 | 0.58 | 0.19 | 2000.000000 | 1000.0 | 328.41 | 1400 | 0.0 | Defaulted | 0.22674 |
| 113848 | NC | Administrative Assistant | Not available | Not displayed | 2007-02-13 | 936433800851164203C5765 | 36 | NaN | NaN | 0 | NaN | 0.0 | 19.0 | NaN | NaN | 833.333333 | NaN | NaN | 1000 | 0.0 | Completed | 0.30709 |
931 rows × 22 columns
df_loan_status.MonthlyLoanPayment.isna().sum()
0
Le payement mensuel est censé refleter le montant que le créditeur doit payer chaque mois, on suppose donc que 0 signifie une valeur manqaunte.
Vérifions maintenant la colonne liés aux statut d'emploiement :
df_loan_status.query('EmploymentStatus == "Full-time"').Occupation.unique()
array(['Waiter/Waitress', 'Professional', 'Executive', 'Accountant/CPA',
'Other', "Nurse's Aide", 'Analyst', 'Realtor',
'Food Service Management', 'Police Officer/Correction Officer',
'Nurse (RN)', 'Computer Programmer', 'Military Officer',
'Sales - Retail', 'Administrative Assistant', 'Military Enlisted',
'Sales - Commission', 'Tradesman - Mechanic', 'Skilled Labor',
'Pharmacist', 'Clerical', 'Landscaping', 'Construction', 'Laborer',
'Tradesman - Carpenter', 'Medical Technician', 'Retail Management',
'Food Service', 'Postal Service', 'Teacher', 'Tradesman - Plumber',
'Architect', 'Engineer - Chemical', 'Scientist', 'Doctor',
'Engineer - Electrical', 'Truck Driver', 'Engineer - Mechanical',
'Attorney', 'Fireman', 'Student - College Graduate Student',
'Tradesman - Electrician', 'Civil Service',
'Pilot - Private/Commercial', 'Nurse (LPN)', 'Principal',
'Religious', 'Chemist', 'Professor', 'Clergy',
'Student - College Junior', 'Bus Driver', 'Flight Attendant',
'Social Worker', "Teacher's Aide", 'Psychologist',
'Student - College Senior', 'Student - Community College',
'Car Dealer', 'Biologist', 'Student - College Freshman',
'Homemaker', 'Dentist', 'Student - College Sophomore',
'Student - Technical School', 'Investor', 'Judge'], dtype=object)
fulltime_students = df_loan_status[(df_loan_status.EmploymentStatus == "Full-time") & (df_loan_status.Occupation.str.contains('Student'))]
fulltime_students.sample(10)
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11757 | AA | Student - College Graduate Student | Full-time | $25,000-49,999 | 2007-04-30 | 10D73380714543112C251DF | 36 | NaN | NaN | 0 | 2.0 | 780.0 | 799.0 | 1.00 | 0.03 | 2333.333333 | NaN | NaN | 1527 | 47.15 | Completed | 0.08118 |
| 61956 | NaN | Student - College Graduate Student | Full-time | $1-24,999 | 2010-05-05 | D0BD3478926638550FC5584 | 36 | 3.0 | 8.0 | 7 | 32.0 | 680.0 | 699.0 | 1.00 | 0.07 | 1833.333333 | NaN | NaN | 2000 | 75.25 | Completed | 0.23149 |
| 74204 | HR | Student - Technical School | Full-time | $1-24,999 | 2007-02-27 | 012F33818408799074FC4DC | 36 | NaN | NaN | 0 | 0.0 | 520.0 | 539.0 | 0.39 | 0.16 | 1666.666667 | NaN | NaN | 1000 | 40.83 | Completed | 0.28938 |
| 65489 | D | Student - College Junior | Full-time | $1-24,999 | 2007-03-15 | 95B1337820323886171787D | 36 | NaN | NaN | 0 | 30.0 | 600.0 | 619.0 | 0.75 | 0.37 | 583.333333 | NaN | NaN | 2500 | 83.63 | Completed | 0.13202 |
| 86880 | C | Student - College Sophomore | Full-time | $1-24,999 | 2008-06-05 | B65D3420607371082EC7329 | 36 | NaN | NaN | 1 | 35.0 | 640.0 | 659.0 | 1.00 | 0.59 | 933.333333 | NaN | NaN | 5000 | 198.80 | Chargedoff | 0.27306 |
| 16074 | NaN | Student - College Freshman | Full-time | $1-24,999 | 2010-05-26 | BF83348442178996542BA76 | 36 | 3.0 | 7.0 | 5 | 8.0 | 700.0 | 719.0 | 1.00 | NaN | 583.333333 | NaN | NaN | 3000 | 121.27 | Chargedoff | 0.28574 |
| 98315 | HR | Student - College Graduate Student | Full-time | $1-24,999 | 2008-06-03 | F66734149708290659AE11F | 36 | NaN | NaN | 4 | 21.0 | 520.0 | 539.0 | 0.96 | 0.34 | 1833.333333 | NaN | NaN | 1000 | 45.24 | Completed | 0.37453 |
| 54188 | AA | Student - College Graduate Student | Full-time | $1-24,999 | 2007-06-19 | DBDE339026494101914EC99 | 36 | NaN | NaN | 0 | 48.0 | 780.0 | 799.0 | 1.00 | 0.36 | 1416.666667 | NaN | NaN | 2500 | 79.50 | Completed | 0.09688 |
| 101840 | HR | Student - College Junior | Full-time | $1-24,999 | 2007-09-28 | 535A3388261526159B6C80D | 36 | NaN | NaN | 0 | 11.0 | 540.0 | 559.0 | 0.66 | 0.21 | 1250.000000 | NaN | NaN | 1500 | 58.85 | Chargedoff | 0.25517 |
| 45377 | B | Student - College Senior | Full-time | $1-24,999 | 2008-04-17 | 68E83418509001663CEF3AD | 36 | NaN | NaN | 4 | 43.0 | 700.0 | 719.0 | 1.00 | 0.11 | 1000.000000 | NaN | NaN | 3000 | 97.86 | Completed | 0.12150 |
fulltime_students.StatedMonthlyIncome.describe()
count 142.000000 mean 1605.001174 std 1020.211506 min 0.000000 25% 841.895833 50% 1666.666667 75% 2166.666667 max 5000.000000 Name: StatedMonthlyIncome, dtype: float64
Entre occupation, EmployementStatus, IncomeRange et StatedMonthlyIncome on voit qu'on a des inconsistences, par exemple pour des étudiants on mentionne que le type d'emploiement est full-time, alors que quand on vérifie les salaires 75% des salaires ne dépassent oas 2300$. Pour vérifier quel colonne contient des informations erroné on investiguera encore d'autres occupations:
df_nurse = df_loan_status.query('Occupation == "Nurse (RN)"')
print(df_nurse.EmploymentStatus.unique())
print(df_nurse.IncomeRange.value_counts())
df_nurse.StatedMonthlyIncome.describe()
df_nurse.query('EmploymentStatus == "Self-employed"')
['Full-time' 'Employed' 'Self-employed' 'Part-time' 'Not available' 'Retired'] $50,000-74,999 336 $75,000-99,999 210 $100,000+ 113 $25,000-49,999 94 Not displayed 49 $1-24,999 10 $0 1 Name: IncomeRange, dtype: int64
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2191 | D | Nurse (RN) | Self-employed | $50,000-74,999 | 2008-06-09 | 3B0834210783478202EF2DC | 36 | NaN | NaN | 1 | 26.0 | 600.0 | 619.0 | 0.95 | 0.24 | 6000.000000 | NaN | NaN | 6000 | 109.65 | Completed | 0.16461 |
| 19502 | AA | Nurse (RN) | Self-employed | $1-24,999 | 2007-03-29 | 8199338342171527151E75B | 36 | NaN | NaN | 0 | 2.0 | 820.0 | 839.0 | 1.00 | 10.01 | 0.083333 | NaN | NaN | 25000 | 840.73 | Completed | 0.15713 |
| 33057 | NaN | Nurse (RN) | Self-employed | $50,000-74,999 | 2012-09-27 | FFBB3559080848532CB1EC2 | 36 | 1.0 | 4.0 | 7 | 15.0 | 760.0 | 779.0 | 0.82 | NaN | 4166.666667 | NaN | NaN | 4000 | 173.71 | Chargedoff | 0.35797 |
| 48469 | NaN | Nurse (RN) | Self-employed | $100,000+ | 2011-02-18 | 3CB43507259349379B0B7FB | 36 | 2.0 | 5.0 | 3 | 82.0 | 660.0 | 679.0 | 0.64 | NaN | 100000.000000 | NaN | NaN | 6000 | 261.29 | Completed | 0.35643 |
| 52371 | D | Nurse (RN) | Self-employed | $25,000-49,999 | 2007-04-17 | 507D3383570364562E1C534 | 36 | NaN | NaN | 0 | 19.0 | 620.0 | 639.0 | 0.61 | 0.37 | 2583.333333 | NaN | NaN | 4500 | 135.79 | Completed | 0.15713 |
| 71040 | B | Nurse (RN) | Self-employed | $100,000+ | 2008-01-22 | 86FD3408891087383B6F275 | 36 | NaN | NaN | 4 | 57.0 | 680.0 | 699.0 | 0.76 | 0.15 | 8333.333333 | NaN | NaN | 3500 | 64.26 | Completed | 0.12402 |
| 88576 | HR | Nurse (RN) | Self-employed | $50,000-74,999 | 2007-04-11 | 37733371187658441A5B141 | 36 | NaN | NaN | 0 | 33.0 | 540.0 | 559.0 | 0.63 | 0.20 | 4602.416667 | NaN | NaN | 3999 | 161.12 | Chargedoff | 0.27535 |
| 98607 | B | Nurse (RN) | Self-employed | $50,000-74,999 | 2008-08-01 | 5C0A3427356939661C6A212 | 36 | NaN | NaN | 5 | 25.0 | 700.0 | 719.0 | 1.00 | NaN | 4666.666667 | NaN | NaN | 10000 | 344.94 | Completed | 0.16083 |
Il apparait que la marge salariale de StatedMonthlyIncome est raisonable sauf quelque outliers qu'on traitera dans la partie exploration, donc on corrigera les valeurs de EmploymentStatus pour les étudiants.
Vérifions maintenant la colonne liés aux salaires :
df_loan_status.query('StatedMonthlyIncome == 0').IncomeRange.value_counts(normalize=True).plot.bar()
<AxesSubplot:>
df_loan_status.query('StatedMonthlyIncome == 0 & IncomeRange == "$1-24,999"')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 43087 | A | Sales - Retail | Full-time | $1-24,999 | 2007-07-10 | 738F339146142197430B6FF | 36 | NaN | NaN | 0 | 24.0 | 740.0 | 759.0 | 1.00 | 0.20 | 0.0 | NaN | NaN | 3820 | 126.88 | Completed | 0.12700 |
| 55170 | B | Other | Full-time | $1-24,999 | 2007-06-11 | 9B4C33910242409617B06CA | 36 | NaN | NaN | 0 | 53.0 | 680.0 | 699.0 | 0.96 | 10.01 | 0.0 | NaN | NaN | 7500 | 312.26 | Completed | 0.29274 |
Apparement le salaire mensuel pour ces deux lignes est inconnu puisque le statut d'emploiement est full-time.
df_loan_status.query('StatedMonthlyIncome == 0').EmploymentStatus.value_counts(normalize=True).plot.bar()
<AxesSubplot:>
On remarque qu'on a des statuts d'emploiements : Full-time, part-time, et employed parmi les lignes où le salaire mensuel est défint par 0. Voyons le contenu des lignes correspondantes:
df_income_zero_with_job = df_loan_status.query('StatedMonthlyIncome == 0 & EmploymentStatus in ["Full-time","Part-time","Employed","Retired"]')
df_income_zero_with_job
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 78 | A | Realtor | Full-time | $0 | 2008-07-08 | D77734052911579724BCDD5 | 36 | NaN | NaN | 3 | 68.0 | 720.0 | 739.0 | 1.00 | NaN | 0.0 | 7750.0 | 6725.84 | 4800 | 171.13 | Chargedoff | 0.18454 |
| 108 | C | Executive | Full-time | $0 | 2008-05-09 | 34F93419082777341558B78 | 36 | NaN | NaN | 1 | 27.0 | 640.0 | 659.0 | 0.85 | NaN | 0.0 | NaN | NaN | 14000 | 564.07 | Completed | 0.28320 |
| 772 | AA | Other | Full-time | $0 | 2008-04-15 | 33313390658251495A144FE | 36 | NaN | NaN | 1 | 2.0 | 820.0 | 839.0 | 1.00 | NaN | 0.0 | NaN | NaN | 3500 | 112.61 | Completed | 0.10491 |
| 954 | C | Principal | Full-time | $0 | 2008-04-24 | AEA63413280733493B66661 | 36 | NaN | NaN | 3 | 54.0 | 660.0 | 679.0 | 0.88 | NaN | 0.0 | NaN | NaN | 7500 | 306.19 | Completed | 0.27767 |
| 1822 | E | Retail Management | Full-time | $0 | 2007-12-18 | FB9733654055843608D0707 | 36 | NaN | NaN | 2 | 170.0 | 580.0 | 599.0 | 0.74 | NaN | 0.0 | 1000.0 | 590.65 | 1250 | 43.67 | Completed | 0.16991 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 112398 | AA | Other | Retired | $0 | 2007-05-30 | E4D23379032731659A8AF0B | 36 | NaN | NaN | 0 | 87.0 | 860.0 | 879.0 | 1.00 | NaN | 0.0 | NaN | NaN | 1000 | 30.88 | Completed | 0.08718 |
| 113409 | B | Other | Full-time | $0 | 2007-11-23 | F62433681152495415568C2 | 36 | NaN | NaN | 0 | 7.0 | 680.0 | 699.0 | 0.94 | NaN | 0.0 | NaN | NaN | 7000 | 209.80 | Completed | 0.05672 |
| 113485 | B | Skilled Labor | Full-time | $0 | 2008-06-09 | D8CF3392717841450B750B9 | 36 | NaN | NaN | 4 | 12.0 | 680.0 | 699.0 | 1.00 | NaN | 0.0 | 10000.0 | 7768.47 | 6000 | 213.28 | Completed | 0.18756 |
| 113608 | NaN | Sales - Commission | Employed | $0 | 2011-07-29 | DD963503228294113D59593 | 60 | 4.0 | 6.0 | 1 | 8.0 | 660.0 | 679.0 | 1.00 | NaN | 0.0 | 10000.0 | 8623.87 | 8000 | 220.91 | Chargedoff | 0.24252 |
| 113686 | A | Other | Full-time | $0 | 2007-12-13 | F6EA34048867335192E230D | 36 | NaN | NaN | 0 | 26.0 | 740.0 | 759.0 | 1.00 | NaN | 0.0 | NaN | NaN | 14900 | 560.21 | Completed | 0.21588 |
344 rows × 22 columns
la plupart de ces lignes ont un IncomeRange = 0$, on considère donc que ce sont des valeurs inconnu à supprimer.
Aux états unis le salaire minimal est définit à 7.25$ par heure et un emplois full-time correspond à un minimum d'heures 30 heures par semaine ce qui donne un salaire minimum par mois de 870$, vérifions si c'est le cas au niveau de ce dataset :
df_loan_status.query('StatedMonthlyIncome < 870 & EmploymentStatus == "Full-time"')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 78 | A | Realtor | Full-time | $0 | 2008-07-08 | D77734052911579724BCDD5 | 36 | NaN | NaN | 3 | 68.0 | 720.0 | 739.0 | 1.00 | NaN | 0.000000 | 7750.0 | 6725.84 | 4800 | 171.13 | Chargedoff | 0.18454 |
| 108 | C | Executive | Full-time | $0 | 2008-05-09 | 34F93419082777341558B78 | 36 | NaN | NaN | 1 | 27.0 | 640.0 | 659.0 | 0.85 | NaN | 0.000000 | NaN | NaN | 14000 | 564.07 | Completed | 0.28320 |
| 226 | D | Other | Full-time | Not displayed | 2007-03-07 | 0E243382750780180A98377 | 36 | NaN | NaN | 0 | 9.0 | 600.0 | 619.0 | 0.95 | 10.01 | 0.083333 | NaN | NaN | 1000 | 32.27 | Completed | 0.11749 |
| 772 | AA | Other | Full-time | $0 | 2008-04-15 | 33313390658251495A144FE | 36 | NaN | NaN | 1 | 2.0 | 820.0 | 839.0 | 1.00 | NaN | 0.000000 | NaN | NaN | 3500 | 112.61 | Completed | 0.10491 |
| 954 | C | Principal | Full-time | $0 | 2008-04-24 | AEA63413280733493B66661 | 36 | NaN | NaN | 3 | 54.0 | 660.0 | 679.0 | 0.88 | NaN | 0.000000 | NaN | NaN | 7500 | 306.19 | Completed | 0.27767 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 112712 | D | Other | Full-time | $1-24,999 | 2008-01-23 | 7B8A33660740310807D7132 | 36 | NaN | NaN | 3 | 13.0 | 620.0 | 639.0 | 1.00 | 10.01 | 0.083333 | 4800.0 | 3957.83 | 2850 | 111.81 | Chargedoff | 0.25517 |
| 113407 | C | Other | Full-time | $1-24,999 | 2008-02-15 | AD173403480233281BF3FE1 | 36 | NaN | NaN | 4 | 25.0 | 640.0 | 659.0 | 0.50 | 2.16 | 233.333333 | NaN | NaN | 1386 | 54.12 | Completed | 0.25163 |
| 113409 | B | Other | Full-time | $0 | 2007-11-23 | F62433681152495415568C2 | 36 | NaN | NaN | 0 | 7.0 | 680.0 | 699.0 | 0.94 | NaN | 0.000000 | NaN | NaN | 7000 | 209.80 | Completed | 0.05672 |
| 113485 | B | Skilled Labor | Full-time | $0 | 2008-06-09 | D8CF3392717841450B750B9 | 36 | NaN | NaN | 4 | 12.0 | 680.0 | 699.0 | 1.00 | NaN | 0.000000 | 10000.0 | 7768.47 | 6000 | 213.28 | Completed | 0.18756 |
| 113686 | A | Other | Full-time | $0 | 2007-12-13 | F6EA34048867335192E230D | 36 | NaN | NaN | 0 | 26.0 | 740.0 | 759.0 | 1.00 | NaN | 0.000000 | NaN | NaN | 14900 | 560.21 | Completed | 0.21588 |
501 rows × 22 columns
On remarque qu'il y a des salaires < 800 pour des job full-time et qui ont des occupations qui correspondent à des salaires minimum bien supérieur à ce qui est définit. Là on a du doute entre quel valeur est incorrecte (statut d'emploiement, occupation ou le salaire mensuel) donc on supprimera ces lignes puisqu'elle ne sont pas assez nombreuse.
On suppose que quelqu'un aui prendra un crédit aura au moins 100$ par mois. Vérifions les salaires < 100:
df_loan_status.query('StatedMonthlyIncome < 100')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 78 | A | Realtor | Full-time | $0 | 2008-07-08 | D77734052911579724BCDD5 | 36 | NaN | NaN | 3 | 68.0 | 720.0 | 739.0 | 1.00 | NaN | 0.000000 | 7750.0 | 6725.84 | 4800 | 171.13 | Chargedoff | 0.18454 |
| 100 | NaN | Other | Not employed | Not employed | 2013-01-30 | CEE83539008163122ED9B62 | 36 | 1.0 | 3.0 | 13 | 12.0 | 720.0 | 739.0 | 1.00 | NaN | 0.000000 | 5000.0 | 3802.10 | 4000 | 172.76 | Chargedoff | 0.35356 |
| 108 | C | Executive | Full-time | $0 | 2008-05-09 | 34F93419082777341558B78 | 36 | NaN | NaN | 1 | 27.0 | 640.0 | 659.0 | 0.85 | NaN | 0.000000 | NaN | NaN | 14000 | 564.07 | Completed | 0.28320 |
| 146 | A | NaN | NaN | Not displayed | 2006-08-29 | 862A3365213342789F57EB7 | 36 | NaN | NaN | 0 | NaN | 740.0 | 759.0 | NaN | 0.07 | 0.083333 | NaN | NaN | 4000 | 127.66 | Completed | 0.09939 |
| 170 | NaN | Other | Not employed | Not employed | 2012-04-17 | 0B3A35450946276237DF676 | 36 | 3.0 | 5.0 | 7 | 9.0 | 760.0 | 779.0 | 1.00 | NaN | 0.000000 | NaN | NaN | 10000 | 397.55 | Completed | 0.28851 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113626 | A | NaN | NaN | Not displayed | 2006-09-05 | 0668336539701022236C70C | 36 | NaN | NaN | 0 | NaN | 720.0 | 739.0 | NaN | 10.01 | 0.083333 | NaN | NaN | 25000 | 891.32 | Completed | 0.17722 |
| 113682 | NaN | Other | Not employed | Not employed | 2012-01-17 | EBE5351272522740954AE3B | 60 | 3.0 | 5.0 | 1 | 87.0 | 760.0 | 779.0 | 1.00 | NaN | 0.000000 | NaN | NaN | 10000 | 323.35 | Chargedoff | 0.32680 |
| 113686 | A | Other | Full-time | $0 | 2007-12-13 | F6EA34048867335192E230D | 36 | NaN | NaN | 0 | 26.0 | 740.0 | 759.0 | 1.00 | NaN | 0.000000 | NaN | NaN | 14900 | 560.21 | Completed | 0.21588 |
| 113767 | HR | Other | Self-employed | $0 | 2008-02-04 | 48573410660081558A9CA02 | 36 | NaN | NaN | 3 | 22.0 | 520.0 | 539.0 | 0.00 | NaN | 0.000000 | NaN | NaN | 5000 | 224.77 | Chargedoff | 0.36945 |
| 113803 | B | Sales - Retail | Self-employed | $0 | 2008-08-12 | 57DF33738770980933C5777 | 36 | NaN | NaN | 7 | 34.0 | 680.0 | 699.0 | 0.84 | NaN | 0.000000 | 1000.0 | 724.82 | 5000 | 161.10 | Defaulted | 0.11293 |
1422 rows × 22 columns
Comme dernière vérification, on doute si les 0 des colonnes numeriques sont effectivement des 0 ou ils représentent des valeurs manquantes, c'est ce qu'on verifiera par la suite :
df_loan_status.describe()
| Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 55084.000000 | 26005.000000 | 26005.000000 | 55084.000000 | 47473.000000 | 54494.000000 | 54494.000000 | 47544.000000 | 50854.000000 | 55084.000000 | 10539.000000 | 10539.000000 | 55084.000000 | 55084.000000 | 55059.000000 |
| mean | 36.942851 | 3.662911 | 6.265987 | 2.407578 | 80.895878 | 671.746247 | 690.746247 | 0.856399 | 0.289866 | 5054.723018 | 7104.678315 | 2332.674796 | 6261.892746 | 223.729882 | 0.222194 |
| std | 7.333480 | 1.814450 | 2.319612 | 3.397823 | 84.669661 | 81.908137 | 81.908137 | 0.174057 | 0.736738 | 5711.051112 | 6395.297342 | 3285.130455 | 5217.758039 | 185.727523 | 0.088177 |
| min | 12.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 19.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1000.000000 | 0.000000 | 0.006530 |
| 25% | 36.000000 | 2.000000 | 5.000000 | 0.000000 | 21.000000 | 640.000000 | 659.000000 | 0.770000 | 0.130000 | 2810.604167 | 3000.000000 | 0.000000 | 2600.000000 | 98.290000 | 0.149740 |
| 50% | 36.000000 | 3.000000 | 6.000000 | 1.000000 | 52.000000 | 680.000000 | 699.000000 | 0.920000 | 0.200000 | 4166.666667 | 5000.000000 | 1098.020000 | 4500.000000 | 172.600000 | 0.214340 |
| 75% | 36.000000 | 5.000000 | 8.000000 | 3.000000 | 112.000000 | 720.000000 | 739.000000 | 1.000000 | 0.300000 | 6250.000000 | 9500.000000 | 3382.560000 | 8000.000000 | 300.472500 | 0.295100 |
| max | 60.000000 | 7.000000 | 11.000000 | 20.000000 | 755.000000 | 880.000000 | 899.000000 | 1.000000 | 10.010000 | 618547.833333 | 60001.000000 | 22586.670000 | 35000.000000 | 2251.510000 | 0.512290 |
Les variables à verifier sont les suivantes : EmploymentStatusDuration, TradesNeverDelinquent, DebtToIncomeRatio.
Afin de vérifier le cas de la variable TradesNeverDelinquent on se basera sur les valeurs de CreditScoreRangeLower, si ces valeurs sonyt assez faible donc il est possible que le crediteur est toujours en délinquences par rapport ses payements.
df_loan_status.loc[df_loan_status['TradesNeverDelinquent (percentage)'] == 0,'CreditScoreRangeLower'].describe()
count 51.000000 mean 518.039216 std 116.017578 min 0.000000 25% 520.000000 50% 520.000000 75% 530.000000 max 740.000000 Name: CreditScoreRangeLower, dtype: float64
La medianne est à 520 qui est considéré comme une valeur faible, par conséquent ces valeurs ne reprsentent pas des valeurs manquantes.
Passons maintenant à la variable EmploymentStatusDuration:
df_loan_status.query('EmploymentStatusDuration == 0 & EmploymentStatus != "Not employed"')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | EmploymentStatusDuration | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 443 | A | Sales - Commission | Full-time | $25,000-49,999 | 2007-12-12 | 493E33654938822447A0B4F | 36 | NaN | NaN | 0 | 0.0 | 740.0 | 759.0 | 1.00 | 0.20 | 2166.666667 | 2550.0 | 1724.48 | 3750 | 120.12 | Completed | 0.10190 |
| 589 | D | Professional | Full-time | $25,000-49,999 | 2007-03-13 | 80513367885883028D9BA1F | 36 | NaN | NaN | 0 | 0.0 | 600.0 | 619.0 | 0.70 | 0.46 | 4000.000000 | NaN | NaN | 2550 | 92.83 | Defaulted | 0.19228 |
| 754 | NaN | Professional | Employed | $25,000-49,999 | 2012-10-24 | E5583560494700733E7F662 | 36 | 6.0 | 9.0 | 3 | 0.0 | 800.0 | 819.0 | 1.00 | 0.17 | 2583.333333 | NaN | NaN | 10000 | 321.45 | Completed | 0.12528 |
| 804 | NaN | Other | Employed | $25,000-49,999 | 2011-06-27 | FF8A351838845452653814E | 36 | 3.0 | 6.0 | 7 | 0.0 | 680.0 | 699.0 | 1.00 | NaN | 3833.333333 | NaN | NaN | 2000 | 81.64 | Completed | 0.30532 |
| 991 | NaN | Computer Programmer | Employed | $75,000-99,999 | 2013-03-01 | EDA73571034718063A03677 | 12 | 5.0 | 4.0 | 1 | 0.0 | 700.0 | 719.0 | 1.00 | 0.28 | 6416.666667 | NaN | NaN | 4981 | 440.46 | Completed | 0.16803 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 112252 | E | Dentist | Full-time | $0 | 2008-07-01 | 22D73423483179178AC7B09 | 36 | NaN | NaN | 4 | 0.0 | 560.0 | 579.0 | 0.73 | NaN | 0.000000 | NaN | NaN | 3000 | 119.28 | Completed | 0.27306 |
| 112959 | AA | Executive | Full-time | $100,000+ | 2007-03-08 | 770F3378836893721C4950D | 36 | NaN | NaN | 0 | 0.0 | 760.0 | 779.0 | 0.88 | 0.12 | 5416.666667 | NaN | NaN | 20000 | 131.43 | Defaulted | 0.10441 |
| 113125 | E | Analyst | Full-time | $25,000-49,999 | 2007-12-19 | 13D433987929061856E0050 | 36 | NaN | NaN | 4 | 0.0 | 580.0 | 599.0 | 1.00 | 0.14 | 2416.666667 | NaN | NaN | 1250 | 46.45 | Completed | 0.21480 |
| 113466 | NaN | Other | Other | $50,000-74,999 | 2012-05-24 | 2DCD35480331567288CA0FA | 36 | 4.0 | 6.0 | 1 | 0.0 | 740.0 | 759.0 | 1.00 | NaN | 4166.666667 | NaN | NaN | 5000 | 189.61 | Completed | 0.25259 |
| 113503 | NaN | Professional | Full-time | $1-24,999 | 2010-02-25 | 5DB434754656845987B6323 | 36 | 3.0 | 7.0 | 7 | 0.0 | 760.0 | 779.0 | 1.00 | NaN | 2000.000000 | NaN | NaN | 3200 | 142.95 | Chargedoff | 0.36438 |
577 rows × 22 columns
Apparement il y a pas mal de ligne où la durée d'emploiement est definit à 0 alors que les statut d'emploiement est différent de Not employed. On supprimera donc cette colonne puisqu'elle n'est pas vraiment primordiale pour l'analyse.
df_loan_status.drop('EmploymentStatusDuration', axis = 1, inplace = True)
# test
assert 'EmploymentStatusDuration' not in df_loan_status.columns.values
On passe maintenant à la vérfication de la variable DebtToIncomeRatio:
df_loan_status.query('DebtToIncomeRatio == 0')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 935 | AA | Other | Full-time | $100,000+ | 2008-03-28 | 7A4B3414926714371A530E6 | 36 | NaN | NaN | 4 | 860.0 | 879.0 | 1.00 | 0.0 | 9000.000000 | NaN | NaN | 1000 | 29.97 | Completed | 0.06688 |
| 8066 | NaN | Police Officer/Correction Officer | Employed | $100,000+ | 2011-04-26 | 4A4635120865562207697F6 | 36 | 1.0 | 4.0 | 1 | 660.0 | 679.0 | 0.83 | 0.0 | 416666.666667 | NaN | NaN | 2000 | 87.10 | Completed | 0.35643 |
| 11312 | AA | Realtor | Full-time | Not displayed | 2007-03-21 | D5D73379897640778CE86DC | 36 | NaN | NaN | 0 | 780.0 | 799.0 | 0.75 | 0.0 | 75000.000000 | NaN | NaN | 3000 | 94.01 | Completed | 0.08684 |
| 17411 | NaN | Professional | Employed | $100,000+ | 2011-05-06 | 70343511744352918C2AEA6 | 36 | 2.0 | 5.0 | 1 | 660.0 | 679.0 | 0.62 | 0.0 | 483333.333333 | NaN | NaN | 7500 | 326.62 | Completed | 0.35643 |
| 19798 | E | Computer Programmer | Not available | Not displayed | 2006-12-15 | 8DC8337350298591354E1B8 | 36 | NaN | NaN | 0 | 540.0 | 559.0 | NaN | 0.0 | 10000.000000 | NaN | NaN | 1200 | 42.78 | Completed | 0.18515 |
| 42672 | AA | Analyst | Full-time | $100,000+ | 2007-07-16 | F4203393331145016D31D4A | 36 | NaN | NaN | 0 | 780.0 | 799.0 | 1.00 | 0.0 | 15833.333333 | NaN | NaN | 1000 | 31.18 | Completed | 0.09375 |
| 59329 | NaN | Professional | Employed | $100,000+ | 2011-05-02 | 55BB35113494085578B7FFE | 36 | 5.0 | 8.0 | 7 | 740.0 | 759.0 | 0.80 | 0.0 | 28333.333333 | NaN | NaN | 2000 | 68.35 | Completed | 0.16147 |
| 63123 | AA | Computer Programmer | Not available | Not displayed | 2006-11-21 | CE4C337184565383654AD8B | 36 | NaN | NaN | 0 | 860.0 | 879.0 | NaN | 0.0 | 16666.666667 | NaN | NaN | 1000 | 29.97 | Completed | 0.06698 |
| 65521 | HR | Executive | Not available | Not displayed | 2006-10-27 | 68F033656864123684CBBBE | 36 | NaN | NaN | 0 | 520.0 | 539.0 | NaN | 0.0 | 6600.000000 | NaN | NaN | 5950 | 249.34 | Defaulted | 0.29776 |
| 76903 | AA | NaN | NaN | Not displayed | 2006-07-11 | 63CD3364918051654E38AA5 | 36 | NaN | NaN | 0 | 760.0 | 779.0 | NaN | 0.0 | 25000.000000 | NaN | NaN | 2550 | 76.43 | Completed | 0.05672 |
| 78431 | C | Executive | Full-time | $100,000+ | 2007-08-20 | 4CD73396243292237260213 | 36 | NaN | NaN | 0 | 640.0 | 659.0 | 1.00 | 0.0 | 103334.083333 | NaN | NaN | 5000 | 184.60 | Chargedoff | 0.20253 |
| 82754 | B | Executive | Not available | Not displayed | 2006-08-29 | DB3233663569909464FE268 | 36 | NaN | NaN | 0 | 680.0 | 699.0 | NaN | 0.0 | 14166.666667 | NaN | NaN | 1000 | 32.03 | Completed | 0.11244 |
| 83697 | AA | Other | Not available | Not displayed | 2007-02-15 | 4A98337712025383188AA5C | 36 | NaN | NaN | 0 | 840.0 | 859.0 | NaN | 0.0 | 185081.750000 | NaN | NaN | 5000 | 156.68 | Completed | 0.08684 |
| 97558 | HR | NaN | NaN | Not displayed | 2006-07-07 | 77E033648541512633997FA | 36 | NaN | NaN | 0 | 520.0 | 539.0 | NaN | 0.0 | 6000.000000 | NaN | NaN | 1200 | 38.02 | Defaulted | 0.10192 |
| 97784 | NaN | Other | Full-time | $100,000+ | 2010-02-26 | B9D43476517621544B10EF2 | 36 | 4.0 | 9.0 | 2 | 680.0 | 699.0 | 0.61 | 0.0 | 45833.333333 | NaN | NaN | 3500 | 129.89 | Chargedoff | 0.22135 |
| 102090 | NC | Other | Not available | Not displayed | 2006-08-07 | DEE23366444392492A10BBF | 36 | NaN | NaN | 0 | 0.0 | 19.0 | NaN | 0.0 | 33333.333333 | NaN | NaN | 3600 | 118.71 | Defaulted | 0.12198 |
| 103704 | AA | Computer Programmer | Not available | Not displayed | 2006-09-21 | 0C713365918393496936356 | 36 | NaN | NaN | 0 | 780.0 | 799.0 | NaN | 0.0 | 6875.000000 | NaN | NaN | 2526 | 79.74 | Completed | 0.09186 |
Normalement cette variable est calculé comme étant le ratio de ce que la personne a comme crédit sur comme bien elle gagne, si c'est 0 donc cette personne n'a aucun crédit alors que d'après le résultat précedent ce n'est pas le cas, donc on supprimera ces lignes.
Voyons maintenant la valeur min du taux d'intérêt :
df_loan_status.BorrowerAPR.describe()
count 55059.000000 mean 0.222194 std 0.088177 min 0.006530 25% 0.149740 50% 0.214340 75% 0.295100 max 0.512290 Name: BorrowerAPR, dtype: float64
df_loan_status.query('BorrowerAPR < 0.01')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ProsperRating (numeric) | ProsperScore | ListingCategory (numeric) | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 53558 | HR | Clerical | Not available | Not displayed | 2006-10-18 | DA973371389610730AA66DF | 36 | NaN | NaN | 0 | 480.0 | 499.0 | NaN | 0.04 | 1733.333333 | NaN | NaN | 2600 | 72.46 | Defaulted | 0.00864 |
| 76858 | C | Food Service Management | Full-time | $1-24,999 | 2007-03-27 | F0AC3377846854835F2B667 | 36 | NaN | NaN | 0 | 660.0 | 679.0 | 0.33 | 0.06 | 1833.333333 | NaN | NaN | 3000 | 83.33 | Completed | 0.00653 |
| 78401 | D | Other | Not available | Not displayed | 2006-10-23 | 8C9E337004758482904CED7 | 36 | NaN | NaN | 0 | 600.0 | 619.0 | NaN | 0.11 | 3500.000000 | NaN | NaN | 5000 | 138.89 | Completed | 0.00653 |
Il apparait qu'on a un seul enregistrement avec un taux d'intérêt < 0.01 et qui a une valeur très faible par rapport au reste des valeurs malgrès que le créditeur ne montre pas un profil exceptionnel. On supprimera donc cette ligne.
Problèmes de qualité :
- Données manquantes
- Mettre les variables ordinale dans le bon ordre
- Valeur de rating
NCinconnue- La valeur
Not employeddeIncomerangedevrait être 0 et les valeurs ne doivent pas contenir le symbole$.- Corriger le nom des colonnes qui ont des espaces.
- Pour les données manquantes de la colonne ListingCategory on a deux valuers qui représentent des données manquantes : Not available et other.
- Corriger
EmploymentStatusdes étudiantsStatedMonthlyIncome== 0 alors que l'occupation est full-time etIncomeRangeest > 1- Des salaires < 870 alors que le statut d'emploiement est full-time
- DebtToIncomeRatio == 0 pour des montants de crédit > 0
- Les lignes du dataframe
df_income_zero_with_jobont un salaire mensuel inconnu.- Remplacer
IncomeStatus== $0 où le statut d'emploiement est (full-time, part-time, retired) parNot displayed.- Supprimer les lignes ayant un salaire mensuel < 100
- Statut d'emploiement
Not employedalors que l'occupation est différente de (student et other) -> corriger statut d'emploiement et income range.- Corriger les valeurs de
IncomeRangeetEmploymentStatusdes occupations (Student et Other) pour les lignes où le salaire mensuel est différent de 0.- Deux valeurs inconnu pour
EmploymentStatus(Not available et Other)- Taux d'intérêt très faible à la ligne 76858.
df_income_zero_with_job ont un salaire mensuel inconnu.¶Remplacer ces valeurs par des Nan.
df_loan_status.loc[df_income_zero_with_job.index, "StatedMonthlyIncome"] = np.nan
# test
assert df_loan_status.loc[df_income_zero_with_job.index].StatedMonthlyIncome.isnull().sum() == len(df_income_zero_with_job)
NR à la place des nan pour credit grade (Not Rated)Not available pour employemnt statusNot displayed pour income rangePour le reste des colonnes on supprimera celles qui ont plus de 50% des données manquantes, puis pour le reste des colonnes qui n'ont pas trop de valeurs manquantes on supprimera les lignes ayant das valeurs manquantes.
df_loan_status = df_loan_status.copy()
df_loan_status.CreditGrade.replace(np.nan, 'NR', inplace = True)
df_loan_status.EmploymentStatus.replace(np.nan, 'Not available', inplace = True)
df_loan_status.IncomeRange.replace(np.nan, 'Not displayed', inplace = True)
THRESHOLD = 0.5
isnull_cols = df_loan_status.isnull().sum()/len(df_loan_status)
cols_to_keep = isnull_cols[isnull_cols <= THRESHOLD].index
df_loan_status = df_loan_status[cols_to_keep]
df_loan_status.dropna(inplace = True)
# test
for col in df_loan_status.columns:
assert df_loan_status[col].isnull().sum() == 0
NC inconnue¶Remplacer cette valeur par NR.
df_loan_status.CreditGrade.replace('NC', 'NR', inplace = True)
# Test
assert 'NC' not in df_loan_status.CreditGrade.values
Not employed de Incomerange devrait être 0 et les valeurs ne doivent pas contenir le symbole $¶Remplacer cette valeur par 0 et supprimer le symbole $
df_loan_status.IncomeRange.replace('Not employed', '0', inplace = True)
df_loan_status.IncomeRange = df_loan_status.IncomeRange.str.strip('$')
# test
assert 'Not employed' not in df_loan_status.IncomeRange.values
assert df_loan_status.IncomeRange.str.contains('$', regex = False).sum() == 0
EmploymentStatus (Not available et Other)¶df_loan_status.EmploymentStatus.replace("Other", "Not available", inplace = True)
# Test
assert "Other" not in df_loan_status.EmploymentStatus.unique()
Not employed alors que l'occupation est différente de (student et other)¶mask = (df_loan_status.EmploymentStatus == "Not employed") & (~df_loan_status.Occupation.str.contains('Student|Other', regex = True, na=False))
df_loan_status.loc[mask, ["EmploymentStatus", "IncomeRange", "Occupation"]]
| EmploymentStatus | IncomeRange | Occupation | |
|---|---|---|---|
| 3494 | Not employed | 0 | Sales - Commission |
| 12976 | Not employed | 0 | Psychologist |
| 23186 | Not employed | 0 | Professional |
| 30657 | Not employed | 0 | Analyst |
| 46168 | Not employed | 0 | Teacher's Aide |
| 48124 | Not employed | Not displayed | Retail Management |
| 51591 | Not employed | 0 | Homemaker |
| 52485 | Not employed | 0 | Sales - Retail |
| 59043 | Not employed | 0 | Homemaker |
| 59790 | Not employed | 0 | Homemaker |
| 63607 | Not employed | 0 | Retail Management |
| 79151 | Not employed | 0 | Nurse's Aide |
| 82145 | Not employed | 0 | Homemaker |
| 82933 | Not employed | 0 | Waiter/Waitress |
| 83221 | Not employed | 100,000+ | Homemaker |
| 99010 | Not employed | 0 | Homemaker |
| 99066 | Not employed | 0 | Skilled Labor |
| 105503 | Not employed | 0 | Skilled Labor |
| 109111 | Not employed | 0 | Homemaker |
df_loan_status.loc[mask, ["IncomeRange", "EmploymentStatus"]] = ["Not displayed","Not available"]
# Test
assert df_loan_status.loc[mask, "IncomeRange"].unique() == ["Not displayed"]
assert df_loan_status.loc[mask, "EmploymentStatus"].unique() == ["Not available"]
df_loan_status.loc[mask, ["EmploymentStatus", "IncomeRange", "Occupation"]]
| EmploymentStatus | IncomeRange | Occupation | |
|---|---|---|---|
| 3494 | Not available | Not displayed | Sales - Commission |
| 12976 | Not available | Not displayed | Psychologist |
| 23186 | Not available | Not displayed | Professional |
| 30657 | Not available | Not displayed | Analyst |
| 46168 | Not available | Not displayed | Teacher's Aide |
| 48124 | Not available | Not displayed | Retail Management |
| 51591 | Not available | Not displayed | Homemaker |
| 52485 | Not available | Not displayed | Sales - Retail |
| 59043 | Not available | Not displayed | Homemaker |
| 59790 | Not available | Not displayed | Homemaker |
| 63607 | Not available | Not displayed | Retail Management |
| 79151 | Not available | Not displayed | Nurse's Aide |
| 82145 | Not available | Not displayed | Homemaker |
| 82933 | Not available | Not displayed | Waiter/Waitress |
| 83221 | Not available | Not displayed | Homemaker |
| 99010 | Not available | Not displayed | Homemaker |
| 99066 | Not available | Not displayed | Skilled Labor |
| 105503 | Not available | Not displayed | Skilled Labor |
| 109111 | Not available | Not displayed | Homemaker |
Mettre les varaibles ordinales en ordre
# variables ordinales : CreditGrade, IncomeRange
ordianl_cols = {
'CreditGrade' : ['NR','HR', 'E', 'D', 'C', 'B', 'A', 'AA'],
'IncomeRange' : ['Not displayed','Not employed', '0', '1-24,999', '25,000-49,999', '50,000-74,999', '75,000-99,999', '100,000+']
}
for col, values in ordianl_cols.items():
ordered_var = pd.api.types.CategoricalDtype(ordered = True, categories=values)
df_loan_status[col] = df_loan_status[col].astype(ordered_var)
# test
for col in ordianl_cols.keys():
print(f'{col:-<20}{df_loan_status[col].unique()}')
CreditGrade---------['C', 'NR', 'D', 'E', 'HR', 'A', 'B', 'AA'] Categories (8, object): ['NR' < 'HR' < 'E' < 'D' < 'C' < 'B' < 'A' < 'AA'] IncomeRange---------['25,000-49,999', '1-24,999', '50,000-74,999', '75,000-99,999', '100,000+', '0', 'Not displayed'] Categories (8, object): ['Not displayed' < 'Not employed' < '0' < '1-24,999' < '25,000-49,999' < '50,000-74,999' < '75,000-99,999' < '100,000+']
Laisser uniquement la première valeur.
df_loan_status.sample(1)
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ListingCategory (numeric) | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent (percentage) | DebtToIncomeRatio | StatedMonthlyIncome | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2000 | HR | Administrative Assistant | Full-time | 50,000-74,999 | 2008-05-01 | 1F1534171072937048914C4 | 36 | 1 | 540.0 | 559.0 | 0.84 | 0.16 | 5416.666667 | 2700 | 101.72 | Completed | 0.2325 |
df_loan_status.rename(lambda x : x.split()[0], axis = 1,inplace = True)
# test
for col in df_loan_status.columns.values:
len(col.split()) == 1
Garder uniquement la valeur Other.
df_loan_status.ListingCategory.replace(0,7, inplace = True)
# test
assert df_loan_status.query('ListingCategory == 0').shape[0] == 0
EmploymentStatus des étudiants¶df_loan_status.loc[df_loan_status.Occupation.str.contains('Student'), 'EmploymentStatus'] = "Part-time"
# test
assert df_loan_status.loc[df_loan_status.Occupation.str.contains('Student'), 'EmploymentStatus'].unique() == ['Part-time']
StatedMonthlyIncome == 0 alors que l'occupation est full-time et IncomeRange est > 1¶Supprimer les deux lignes concerné.
df_loan_status.drop(df_loan_status.query('StatedMonthlyIncome == 0').index,axis = 0, inplace = True)
# test
assert df_loan_status.query('StatedMonthlyIncome == 0').shape[0] == 0
Supprimer ces lignes.
df_loan_status.drop(df_loan_status.query('StatedMonthlyIncome < 870 & EmploymentStatus == "Full-time"').index, axis = 0, inplace = True)
#test
assert df_loan_status.query('StatedMonthlyIncome < 870 & EmploymentStatus == "Full-time"').shape[0] == 0
Supprimer ces lignes
df_loan_status.drop(df_loan_status.query('DebtToIncomeRatio == 0').index, axis = 0, inplace = True)
# test
assert df_loan_status.query('DebtToIncomeRatio == 0').shape[0] == 0
IncomeStatus == $0 pour le statut d'emploiement (full-time, part-time, retired)¶Remplacer ces valeurs par Not displayed.
mask = (df_loan_status.IncomeRange == "0") & (df_loan_status.EmploymentStatus.isin(["Full-time","Part-time","Retired"]))
df_loan_status.loc[mask, "IncomeRange"] = "Not displayed"
# test
assert df_loan_status.query('IncomeRange == "0" & EmploymentStatus in ["Full-time","Part-time","Retired"]').shape[0] == 0
Suprimer ces lignes.
df_loan_status.drop(df_loan_status.query('StatedMonthlyIncome < 100').index, axis=0, inplace = True)
#test
assert df_loan_status.query('StatedMonthlyIncome < 100').shape[0] == 0
IncomeRange et EmploymentStatus des occupations (Student et Other) pour les lignes où le salaire mensuel est différent de 0.¶IncomeRange et la valeur Not available pour EmploymetStatusmask = (df_loan_status.EmploymentStatus == "Not employed") & (df_loan_status.Occupation.str.contains('Student|Other', regex = True, na=False)) & (df_loan_status.StatedMonthlyIncome > 0)
df_loan_status.loc[mask, ["EmploymentStatus", "IncomeRange", "StatedMonthlyIncome"]]
| EmploymentStatus | IncomeRange | StatedMonthlyIncome | |
|---|---|---|---|
| 66 | Not employed | 0 | 860.000000 |
| 4683 | Not employed | 0 | 623.000000 |
| 4791 | Not employed | 0 | 2926.000000 |
| 9564 | Not employed | 0 | 833.333333 |
| 9911 | Not employed | 0 | 1000.000000 |
| 10949 | Not employed | 0 | 583.333333 |
| 14599 | Not employed | 0 | 175.000000 |
| 15088 | Not employed | 1-24,999 | 1421.000000 |
| 18423 | Not employed | 0 | 3500.000000 |
| 19100 | Not employed | 1-24,999 | 303.500000 |
| 20103 | Not employed | 0 | 3633.000000 |
| 22873 | Not employed | 25,000-49,999 | 3833.333333 |
| 22905 | Not employed | 0 | 125.250000 |
| 25650 | Not employed | 0 | 856.000000 |
| 26669 | Not employed | 0 | 1360.000000 |
| 28164 | Not employed | 25,000-49,999 | 3820.000000 |
| 28185 | Not employed | 0 | 828.000000 |
| 38889 | Not employed | 0 | 2775.916667 |
| 42332 | Not employed | 0 | 4126.416667 |
| 54605 | Not employed | 0 | 868.000000 |
| 55381 | Not employed | 1-24,999 | 560.000000 |
| 57163 | Not employed | 0 | 2166.666667 |
| 60735 | Not employed | 0 | 900.000000 |
| 64388 | Not employed | 1-24,999 | 1028.000000 |
| 67076 | Not employed | 0 | 2025.000000 |
| 68426 | Not employed | 25,000-49,999 | 2750.000000 |
| 76229 | Not employed | 0 | 1014.000000 |
| 79698 | Not employed | 0 | 643.000000 |
| 80471 | Not employed | 0 | 2348.000000 |
| 83721 | Not employed | Not displayed | 600.000000 |
| 86265 | Not employed | 0 | 1400.000000 |
| 87620 | Not employed | 0 | 1250.000000 |
| 88171 | Not employed | 0 | 393.416667 |
| 88504 | Not employed | 0 | 839.666667 |
| 88628 | Not employed | 0 | 964.333333 |
| 89811 | Not employed | 25,000-49,999 | 2685.000000 |
| 92119 | Not employed | 0 | 4000.000000 |
| 92994 | Not employed | 1-24,999 | 315.416667 |
| 93230 | Not employed | 0 | 1507.000000 |
| 95718 | Not employed | 1-24,999 | 2000.000000 |
| 98204 | Not employed | 0 | 669.000000 |
| 104014 | Not employed | 0 | 875.000000 |
| 105687 | Not employed | 0 | 9096.000000 |
| 106081 | Not employed | 0 | 3000.000000 |
| 107228 | Not employed | 0 | 4166.666667 |
| 109052 | Not employed | 0 | 1409.250000 |
| 109600 | Not employed | 0 | 1000.000000 |
| 112778 | Not employed | 0 | 2972.000000 |
| 113692 | Not employed | 0 | 1289.000000 |
df_loan_status.loc[mask, "EmploymentStatus"] = "Not available"
def set_income_range(data):
if data.StatedMonthlyIncome < 2500:
return df_loan_status.IncomeRange.cat.categories[3]
elif (data.StatedMonthlyIncome > 2500) & (data.StatedMonthlyIncome < 5000):
return df_loan_status.IncomeRange.cat.categories[4]
elif (data.StatedMonthlyIncome > 5000 )& (data.StatedMonthlyIncome < 7500):
return df_loan_status.IncomeRange.cat.categories[5]
elif (data.StatedMonthlyIncome > 7500) & (data.StatedMonthlyIncome < 9000):
return df_loan_status.IncomeRange.cat.categories[6]
else:
return df_loan_status.IncomeRange.cat.categories[7]
df_loan_status.loc[mask, "IncomeRange"] = df_loan_status.loc[mask, ["StatedMonthlyIncome", "IncomeRange"]].apply(set_income_range, axis=1)
df_loan_status.loc[mask, ["StatedMonthlyIncome", "IncomeRange", "EmploymentStatus"]]
| StatedMonthlyIncome | IncomeRange | EmploymentStatus | |
|---|---|---|---|
| 66 | 860.000000 | 1-24,999 | Not available |
| 4683 | 623.000000 | 1-24,999 | Not available |
| 4791 | 2926.000000 | 25,000-49,999 | Not available |
| 9564 | 833.333333 | 1-24,999 | Not available |
| 9911 | 1000.000000 | 1-24,999 | Not available |
| 10949 | 583.333333 | 1-24,999 | Not available |
| 14599 | 175.000000 | 1-24,999 | Not available |
| 15088 | 1421.000000 | 1-24,999 | Not available |
| 18423 | 3500.000000 | 25,000-49,999 | Not available |
| 19100 | 303.500000 | 1-24,999 | Not available |
| 20103 | 3633.000000 | 25,000-49,999 | Not available |
| 22873 | 3833.333333 | 25,000-49,999 | Not available |
| 22905 | 125.250000 | 1-24,999 | Not available |
| 25650 | 856.000000 | 1-24,999 | Not available |
| 26669 | 1360.000000 | 1-24,999 | Not available |
| 28164 | 3820.000000 | 25,000-49,999 | Not available |
| 28185 | 828.000000 | 1-24,999 | Not available |
| 38889 | 2775.916667 | 25,000-49,999 | Not available |
| 42332 | 4126.416667 | 25,000-49,999 | Not available |
| 54605 | 868.000000 | 1-24,999 | Not available |
| 55381 | 560.000000 | 1-24,999 | Not available |
| 57163 | 2166.666667 | 1-24,999 | Not available |
| 60735 | 900.000000 | 1-24,999 | Not available |
| 64388 | 1028.000000 | 1-24,999 | Not available |
| 67076 | 2025.000000 | 1-24,999 | Not available |
| 68426 | 2750.000000 | 25,000-49,999 | Not available |
| 76229 | 1014.000000 | 1-24,999 | Not available |
| 79698 | 643.000000 | 1-24,999 | Not available |
| 80471 | 2348.000000 | 1-24,999 | Not available |
| 83721 | 600.000000 | 1-24,999 | Not available |
| 86265 | 1400.000000 | 1-24,999 | Not available |
| 87620 | 1250.000000 | 1-24,999 | Not available |
| 88171 | 393.416667 | 1-24,999 | Not available |
| 88504 | 839.666667 | 1-24,999 | Not available |
| 88628 | 964.333333 | 1-24,999 | Not available |
| 89811 | 2685.000000 | 25,000-49,999 | Not available |
| 92119 | 4000.000000 | 25,000-49,999 | Not available |
| 92994 | 315.416667 | 1-24,999 | Not available |
| 93230 | 1507.000000 | 1-24,999 | Not available |
| 95718 | 2000.000000 | 1-24,999 | Not available |
| 98204 | 669.000000 | 1-24,999 | Not available |
| 104014 | 875.000000 | 1-24,999 | Not available |
| 105687 | 9096.000000 | 100,000+ | Not available |
| 106081 | 3000.000000 | 25,000-49,999 | Not available |
| 107228 | 4166.666667 | 25,000-49,999 | Not available |
| 109052 | 1409.250000 | 1-24,999 | Not available |
| 109600 | 1000.000000 | 1-24,999 | Not available |
| 112778 | 2972.000000 | 25,000-49,999 | Not available |
| 113692 | 1289.000000 | 1-24,999 | Not available |
df_loan_status.drop(76858, axis=0, inplace=True)
# test
assert 76858 not in df_loan_status.index.values
df_loan_status.shape
(43079, 17)
df_loan_status.to_csv("data/prosperLoanDataCleaned.csv", index=False)
le dataset contient 113937 lignes et 81 colonnes, dont la majorité est de type quantitatif(61) alors que 20 d'entre eux sont de type qualitatif.
Dans cette étude on s'interesse plutot à étudier les variables qui impactent le résultat du statut de crédit et l' APR, donc les colonnes principales seront
LoanStatusetBorrowerAPR.
On suspecte que les variables qui sont en relation avec le statut du créditeur comme son salaire et les variables lié à son historique des crédits, aussi la date de l'obtention de crédit (qui peut être lié à une période d'inflation) auront le plus d'impacte sur le status final du crédit. Donc les variables qu'on a choisit sont les suivantes : CreditGrade, Occupation, EmploymentStatus, IncomeRange, LoanOriginationDate, Term, ListingCategory, CreditScoreRangeLower, CreditScoreRangeUpper, TradesNeverDelinquent, DebtToIncomeRatio, StatedMonthlyIncome, LoanOriginalAmount, MonthlyLoanPayment, Recommendations.
On commencera d'abord par visualiser la variable LoanStatus:
plt.figure(figsize=(5,5))
x = df_loan_status['LoanStatus'].value_counts()
plt.pie(x.values, autopct='%.1f', labels = x.index)
plt.title("Pourcentage des prêts selon leurs statuts")
plt.show()
Il apparait qu'on a peu d'echantillons avec le statut Defaulted par rapport à ceux avec le statut Completed.
Vérifions maintenant la distribution du taux de crédit :
df_loan_status['BorrowerAPR'].skew(), df_loan_status['BorrowerAPR'].kurtosis()
(0.1710431129095633, -1.127273866695665)
Il apparait qu'on a une distribution legèrement symmetrique, et fortement applatit. Pour determiner le nombre de bins optimal on utilisera la règle de : Freedman–Diaconis
def freedman_diaconis(data, return_as='width'):
"""Computes the optimal bins width
Args:
data (pd.Series): The dataframe column that we wish to plot a histogram for
returnas: {"width", "bins"}
If "width", return the estimated width for each histogram bin.
If "bins", return the number of bins suggested by rule.
"""
IQR = data.quantile(0.75) - data.quantile(0.25)
width= 2 * IQR * np.power(len(data), -(1/3))
if return_as == "width":
return width
else:
return np.arange(data.min(), data.max()+np.round(width, 3), np.round(width, 3))
sns.histplot(df_loan_status['BorrowerAPR'], bins = freedman_diaconis(df_loan_status['BorrowerAPR'], 'bins'), kde=True)
plt.title("Distribution du taux d'intérêt")
plt.show()
Il semble que la distribution des taux de crédit est applatit et qu'il n y a pas de valeurs extremes, et qu'on a un mode au niveau de la valeur 0.35.
On passera maintenant à évaluer les variables catégoriques:
fig, axs = plt.subplots(3,1, figsize = [15,15])
default_color = sns.color_palette()[0]
# on tracera des bar charts pour chaque variable
sns.countplot(data = df_loan_status, x = 'CreditGrade', ax = axs[0], color = default_color).set(title="Proportions des notes des créditeurs")
sns.countplot(data = df_loan_status, x = 'IncomeRange', ax = axs[1], color = default_color).set(title="Proportions des salaires annuels")
sns.countplot(data = df_loan_status, x = 'EmploymentStatus', ax = axs[2], color = default_color, order=df_loan_status.EmploymentStatus.value_counts().index).set(title="Proportions des statuts d'emploiement")
plt.show()
On remarque que dans ce dataset la plupart des créditeurs sont des employé à temps plein avec des salaires superieur à 25000 ce qui explique le grand pourcentage du statut Completed qu'on a observé précedement.
Passons maitenant à l' évaluation des variables numériques:
discrete_vars = df_loan_status.select_dtypes('int')
discrete_vars.describe()
| Term | ListingCategory | LoanOriginalAmount | |
|---|---|---|---|
| count | 43079.000000 | 43079.000000 | 43079.000000 |
| mean | 37.109775 | 4.166113 | 6513.869542 |
| std | 7.839136 | 3.516073 | 5273.484398 |
| min | 12.000000 | 1.000000 | 1000.000000 |
| 25% | 36.000000 | 1.000000 | 3000.000000 |
| 50% | 36.000000 | 3.000000 | 5000.000000 |
| 75% | 36.000000 | 7.000000 | 8500.000000 |
| max | 60.000000 | 20.000000 | 35000.000000 |
# colonnes : Term / ListingCategory
fig, axs = plt.subplots(1,2, figsize = [20,5])
term_counts = df_loan_status.Term.value_counts(normalize=True)
listing_category_counts = df_loan_status.ListingCategory.value_counts(normalize=True)
sns.barplot(x = term_counts.index, y = term_counts.values, color=default_color, ax = axs[0]).set(title = 'Pourcentages des durée de crédit en mois')
sns.barplot(x = listing_category_counts.index, y = listing_category_counts.values, color=default_color, ax = axs[1]).set(title = 'Pourcentages des catégories du listing')
plt.show()
On remarque que la plupart des crédits (environ 90%) ont une durée de 36 mois, alors que environ 40% des catégories de listing sont Debt consolidation.
# variable continue : LoanOriginalAmount
bins = 10**np.arange(3,5 + 0.08,0.08)
sns.histplot(df_loan_status.LoanOriginalAmount,bins = bins)
plt.xscale('log')
ticks = 10**np.linspace(3, 5, 6)
plt.xticks(ticks, ['{:.0f}'.format(tick) for tick in ticks])
plt.title("Distribution des montants de crédits")
plt.show()
np.log10(df_loan_status.LoanOriginalAmount).skew(), np.log10(df_loan_status.LoanOriginalAmount).kurtosis()
(-0.024361589902478987, -0.5456539161150977)
D'après cet histogramme on voit que la plupart des montants sont compris entre 2000 et 15000.
Passons maintenant à l'exploration des variables de type float.
continous_vars = df_loan_status.select_dtypes('float64').drop('BorrowerAPR', axis = 1)
continous_vars.describe()
| CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent | DebtToIncomeRatio | StatedMonthlyIncome | MonthlyLoanPayment | |
|---|---|---|---|---|---|---|
| count | 43079.000000 | 43079.000000 | 43079.000000 | 43079.000000 | 43079.000000 | 43079.000000 |
| mean | 680.972632 | 699.972632 | 0.852741 | 0.244537 | 5238.581826 | 231.511189 |
| std | 68.086344 | 68.086344 | 0.175669 | 0.273938 | 3924.864430 | 186.331591 |
| min | 460.000000 | 479.000000 | 0.000000 | 0.010000 | 100.000000 | 0.000000 |
| 25% | 640.000000 | 659.000000 | 0.760000 | 0.130000 | 3000.000000 | 105.010000 |
| 50% | 680.000000 | 699.000000 | 0.920000 | 0.200000 | 4375.000000 | 173.710000 |
| 75% | 720.000000 | 739.000000 | 1.000000 | 0.310000 | 6416.666667 | 309.610000 |
| max | 880.000000 | 899.000000 | 1.000000 | 10.010000 | 185081.750000 | 2251.510000 |
Pour chacune de ces variables on examinera leur distribution :
col_num = 2
row_num = 3
fig, axs = plt.subplots(row_num, col_num, figsize = [20,15])
count = 0
for i in range(row_num):
for j in range(col_num):
if count >= len(continous_vars.columns): break
# On a modifié le resultat de la fonction freedman_diaconis puisqu'on avait beaucoup de bruit au niveau de quelques
# histogrammes
axs[i,j].hist(continous_vars.iloc[:,count], bins = len(freedman_diaconis(continous_vars.iloc[:,count], 'bins'))//4)
axs[i,j].set_title(continous_vars.columns[count])
count = count+1
plt.show()
CreditScoreRangeLower et CreditScoreRangeUpper semblent à peu près les même avec une translation legère vers la droite pour range upper.TradesNeverDelinquent la distribution présente une forte inclinaison à droite, ce qui dit que les compte de crédit qui ont toujours des retard par rapport aux paiement sont consideré comme des valeurs extrèmes.DebtToIncomeRatio, cette distributioin est incliné à gauche et on a des valeurs extrèmes pour les valeurs supérieurs à 0.4MonthlyLoanPayment, cette distribution est incliné à gauche et la plupart des valeurs ne dépassent pas 250.On fera un zoom sur la distribution de DebtToIncomeRatio pour voir encore le valeurs en détail :
plt.hist(df_loan_status.DebtToIncomeRatio, bins = len(freedman_diaconis(df_loan_status.DebtToIncomeRatio, 'bins'))//3)
plt.title('Zoom to debt to income ratio distribution')
plt.xlim(right = 1)
(-0.49, 1.0)
Pour la distribution de StatedMonthlyIncome, on a besoin d'une transformation logarithmique puisqu'il y a une très grande marge entre les valeurs:
bins_range = 10**(np.arange(-1, 6+0.2, 0.2))
plt.figure()
plt.hist(df_loan_status['StatedMonthlyIncome'], bins = bins_range)
ticks = 10**np.linspace(-1, 5, 8)
plt.xscale('log')
plt.xticks(ticks, ['{:.0f}'.format(xtick) for xtick in ticks])
plt.xlim(left=200)
plt.title("Distribution des salaires mensuel en échelle logarithmique")
plt.show()
d'après l'histogramme on remarque que les salaires mensuels sont en générale compris entre 2000 et 13000.
On vérifiera maintenant si les outliers présents dans les distributions fortement incliné sont des valeurs correctes ou des erreurs de saisi:
def get_outliers(data):
"""Returns the list of outilers
Args:
data (pd.Series): The column of dataframe to verify
"""
IQR = data.quantile(0.75) - data.quantile(0.25)
min = data.quantile(0.25) - 1.5 * IQR
max = data.quantile(0.75) + 1.5 * IQR
return data[(data > max) | (data < min)]
# StatedMonthlyIncome:
df_loan_status.StatedMonthlyIncome.plot.box()
outliers = get_outliers(df_loan_status.StatedMonthlyIncome)
print(f"Total Outliers : {len(outliers)}")
outliers.describe()
Total Outliers : 2003
count 2003.000000 mean 16924.304377 std 9061.091766 min 11546.166667 25% 12500.000000 50% 14166.666667 75% 17500.000000 max 185081.750000 Name: StatedMonthlyIncome, dtype: float64
df_loan_status.query("StatedMonthlyIncome > 20000 & IncomeRange != '100,000+'").IncomeRange.value_counts(normalize=True)
Not displayed 0.50 50,000-74,999 0.25 75,000-99,999 0.25 Not employed 0.00 0 0.00 1-24,999 0.00 25,000-49,999 0.00 100,000+ 0.00 Name: IncomeRange, dtype: float64
On remarque que quelques lignes contiennent des marges de salaire annuel < 100k parmi les résultats où le salaire mensuel est > 20000. Voyons ces cas plus en détails:
df_loan_status.query("StatedMonthlyIncome > 20000 & IncomeRange in ['50,000-74,999', '75,000-99,999']")
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ListingCategory | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent | DebtToIncomeRatio | StatedMonthlyIncome | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25495 | D | Professional | Full-time | 50,000-74,999 | 2007-04-19 | 0B603386231762702E5D8A4 | 36 | 7 | 600.0 | 619.0 | 0.95 | 0.40 | 64994.666667 | 13200 | 475.51 | Completed | 0.20614 |
| 93680 | B | Professional | Full-time | 75,000-99,999 | 2007-03-12 | 5F743381107437297CE1E52 | 36 | 7 | 680.0 | 699.0 | 0.76 | 0.13 | 78333.333333 | 9800 | 284.00 | Completed | 0.12500 |
df_loan_status.query("MemberKey in ['5F743381107437297CE1E52', '0B603386231762702E5D8A4']")
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ListingCategory | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent | DebtToIncomeRatio | StatedMonthlyIncome | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25495 | D | Professional | Full-time | 50,000-74,999 | 2007-04-19 | 0B603386231762702E5D8A4 | 36 | 7 | 600.0 | 619.0 | 0.95 | 0.40 | 64994.666667 | 13200 | 475.51 | Completed | 0.20614 |
| 93680 | B | Professional | Full-time | 75,000-99,999 | 2007-03-12 | 5F743381107437297CE1E52 | 36 | 7 | 680.0 | 699.0 | 0.76 | 0.13 | 78333.333333 | 9800 | 284.00 | Completed | 0.12500 |
Apparement, on n'a que deux lignes où le salaire mensuel ne correspond pas au salaire annuel alors que le reste est définit comme Not available, puisqu'on n'a pas assez d'informations pour déterminer la valeur correcte on supprimera ces deux lignes.
df_loan_status.drop(df_loan_status.query("StatedMonthlyIncome > 20000 & IncomeRange in ['50,000-74,999', '75,000-99,999']").index, axis = 0, inplace = True)
# test
assert df_loan_status.query("StatedMonthlyIncome > 20000 & IncomeRange in ['50,000-74,999', '75,000-99,999']").shape[0] == 0
df_highest_salaries = df_loan_status.query("StatedMonthlyIncome > 20000")
df_highest_salaries.groupby('Occupation')['StatedMonthlyIncome'].median().sort_values()
Occupation Chemist 20221.333333 Accountant/CPA 20833.333333 Pilot - Private/Commercial 20833.333333 Homemaker 20833.333333 Nurse (LPN) 21666.666667 Truck Driver 22333.333333 Computer Programmer 22916.666667 Sales - Commission 22916.666667 Dentist 23750.000000 Attorney 24263.125000 Retail Management 24583.333333 Executive 25000.000000 Realtor 25000.000000 Doctor 25833.333333 Professional 26791.666667 Other 27083.333333 Analyst 29500.000000 Investor 34375.000000 Engineer - Chemical 43333.333333 Construction 51333.333333 Sales - Retail 66666.666667 Nurse (RN) 96266.500000 Name: StatedMonthlyIncome, dtype: float64
On remarque que quelques occupations (Nurse (RN), Sales-Retail) ont des salaires plus large que les meilleurs salaires en USA pour ces occupations, et la valeur définit représente le salaire max annuel plutôt que mensuel.
How much does a Retail Sales make in USA? The average retail sales salary in the USA is $31,200 per year or $16 per hour. Entry level positions start at $25,350 per year while most experienced workers make up to $55,000 per year. -- Source : www.talent.com
The average registered nurse salary in the USA is $73,638 per year or $37.76 per hour. Entry level positions start at $58,500 per year while most experienced workers make up to $114,952 per year. -- Source : www.talent.com
Pour les salaires du domaine de construction le max ne dépasse pas 130k-140k pour les managers par an, donc il va falloir vérifier ce cas.
df_highest_salaries.query('Occupation in ["Nurse (RN)", "Sales - Retail"]')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ListingCategory | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent | DebtToIncomeRatio | StatedMonthlyIncome | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 59928 | NR | Nurse (RN) | Full-time | 100,000+ | 2010-03-24 | EBDE3419188956387D701BC | 36 | 1 | 680.0 | 699.0 | 0.89 | 0.01 | 96266.500000 | 3000 | 107.59 | Completed | 0.22135 |
| 80092 | NR | Sales - Retail | Employed | 100,000+ | 2012-08-31 | A34B3554452628023256E73 | 12 | 3 | 680.0 | 699.0 | 0.71 | 0.10 | 66666.666667 | 15000 | 1338.28 | Completed | 0.18526 |
On n'a que deux enregistrements avec des salires incorrectes, donc on supprimera ces deux lignes.
df_loan_status.drop([59928, 59928], axis=0, inplace = True)
# test
assert not df_loan_status.index.isin([59928, 59928]).any()
Vérifions maintenant les salaires de l'occupation construction:
df_loan_status.query('Occupation == "Construction" & StatedMonthlyIncome > 20000')
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ListingCategory | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent | DebtToIncomeRatio | StatedMonthlyIncome | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15814 | B | Construction | Full-time | 100,000+ | 2007-09-12 | 4E3C33987263003958CCBCE | 36 | 7 | 700.0 | 719.0 | 1.00 | 0.05 | 58616.666667 | 25000 | 941.88 | Defaulted | 0.21739 |
| 62864 | A | Construction | Self-employed | 100,000+ | 2007-12-04 | EC04340374151634956586A | 36 | 7 | 720.0 | 739.0 | 0.88 | 0.06 | 51333.333333 | 25000 | 860.52 | Defaulted | 0.15211 |
| 94583 | AA | Construction | Full-time | 100,000+ | 2007-09-12 | 91473381930294171105375 | 36 | 7 | 800.0 | 819.0 | 0.97 | 0.22 | 20833.333333 | 2500 | 79.50 | Completed | 0.09688 |
| 100431 | C | Construction | Full-time | 100,000+ | 2007-05-16 | C8003386302521489A5DE84 | 36 | 7 | 640.0 | 659.0 | 0.65 | 0.04 | 70350.000000 | 25000 | 1047.64 | Chargedoff | 0.29776 |
| 101793 | AA | Construction | Full-time | 100,000+ | 2008-07-24 | C1C734249229525336F3411 | 36 | 1 | 780.0 | 799.0 | 1.00 | 0.08 | 29166.666667 | 20000 | 662.85 | Completed | 0.12550 |
Pour le max on est allé jusqu'à 20000 par mois, et on se retrouve avec 5 lignes qui dépaseent cette valeur. Malgré que les deux lignes où le salaire ne dépasse pas 30000 ont des valuers élévé au niveau des indicateurs(score/note de crédit) et le statut est completed, mais le salaire reste très élévé par rapport au max, donc on supprimera ces cinq lignes pour ne pas fausser notre étude.
The average construction manager salary in the USA is $87,921 per year or $45.09 per hour. Entry level positions start at $65,011 per year while most experienced workers make up to $130,000 per year. -- Source : www.talent.com
df_loan_status.drop(df_loan_status.query('Occupation == "Construction" & StatedMonthlyIncome > 20000').index, axis=0,inplace = True)
# Test
assert df_loan_status.query('Occupation == "Construction" & StatedMonthlyIncome > 20000').shape[0] == 0
Les variables d'intérêt dans cet étude sont le taux d'intéret et le statut du crédit. On remarque que pour le statut de crédit on a un très grand pourcentage pour la valeur
Completedpar rapport aux statutsDefaultedetChargedoff. Pour le taux d'intérét, la distribution représente une faible inclinainson vers la droite et est largement applatit, les valeurs entre 0.1 et 0.3 semblent avoir à peu près la même fréquence avec un mode au tour de 0.35.
Pour les autres variables, il fallait effectué une transformation logarithmique pour le montant du crédit et le salaire mensuel puisqu'il y avait une très grande différence entre les valeurs. On avait aussi des outliers au niveau du salaire mensuel qui depassait la limite pour certaines occuaptions.
Dans une première partie on étudiera la relation entre les targets et les varaibles selectionné précédemment, on commencera d'abord par étudier l'effet des variables sur le taux d'interêt:
# make box plots for each variable
fig, axs = plt.subplots(3,1)
fig.set_figwidth(20)
fig.set_figheight(20)
sns.boxplot(data = df_loan_status, x = 'CreditGrade', y = 'BorrowerAPR', ax = axs[0]).set(title="Taux d'intérêt vs Credit grade")
sns.boxplot(data = df_loan_status, x = 'EmploymentStatus', y = 'BorrowerAPR', ax = axs[1]).set(title="Taux d'intérêt vs Employment status")
sns.boxplot(data = df_loan_status, x = 'IncomeRange', y = 'BorrowerAPR', ax = axs[2]).set(title="Taux d'intérêt vs Income range")
plt.show()
IncomeRange il apparait qu'il y a une décroissance languissante du taux d'intérêt avec la croissance du salaire.CreditGrade on voit bien qu'il y a une decroissance du taux de crédit pour les meilleurs notes.Self Employed a la médianne et l'écart interquartile les plus faibles avec quelques outliers pour des taux élévé, alors que (Full-time, part-time et retired) ont à peu près la même medianne sauf que les retraités ont un écart interquartile plus large.plt.figure(figsize=(10,6))
sns.lineplot(data=df_loan_status, x = df_loan_status.LoanOriginationDate.dt.year, y = 'BorrowerAPR')
plt.title("Taux d'intérêt par année")
plt.show()
On remarque d'après le graphe précédent qu'il y avait une très grande croissance entre les années 2007 et 2011, et puis pour les années suivantes le taux d'interêt a commencé à décroitre. Prenant un regard encors plus détaillé sur les années 2007 et 2008.
df_2007 = df_loan_status.loc[df_loan_status.LoanOriginationDate.dt.year == 2007]
df_2008 = df_loan_status.loc[df_loan_status.LoanOriginationDate.dt.year == 2008]
fig, axs = plt.subplots(2,1)
fig.set_figwidth(10)
fig.set_figheight(10)
sns.lineplot(data=df_2007, x = df_2007.LoanOriginationDate.dt.month, y = 'BorrowerAPR', ax = axs[0]).set(title="Taux d'intérêt pour l'année 2007")
sns.lineplot(data=df_2008, x = df_2008.LoanOriginationDate.dt.month, y = 'BorrowerAPR', ax = axs[1]).set(title="Taux d'intérêt pour l'année 2008")
plt.show()
verifions si la différence entre avant et après l'année 2011 est dû à la l'offre et la demande des crédit sur Prosper:
df_before_2011 = df_loan_status.loc[df_loan_status.LoanOriginationDate.dt.year <= 2011]
df_after_2011 = df_loan_status.loc[df_loan_status.LoanOriginationDate.dt.year > 2011]
print(df_before_2011.shape[0], df_after_2011.shape[0])
33453 9618
On constate qu' effectivement avant l'année 2011 le nombre de demandeurs de crédit est beaucoup plus elevé qu' après 2011.
plt.figure()
sns.boxplot(data = df_loan_status, x = 'Term', y = 'BorrowerAPR')
plt.title("Taux d'intérêt vs durrée du crédit")
plt.show()
Pour les crédit de 36 mois et 12 mois on ne voit pas une grande différence entre les médiannes, mais pour 60 mois les taux de crédit sont clairement plus elevé par rapport à 12 et 36 mois.
plt.figure(figsize=(10,8))
sns.pointplot(data=df_loan_status, x = 'ListingCategory', y = 'BorrowerAPR', join=False)
plt.title("Taux d'intérêt vs catégorie du crédit")
plt.show()
Il apparait que les taux de crédit pour la catégorie Boat sont les plus faible, c'est peut être dû au fait que c'est un crédit sécurisé et le bateau sera utilisé comme garantie de prêt. Alors que parmi les taux les plus elevé on trouve les prêts des dépenses ménagères, dépenses médicals ou dentals, des chirurgies cosmétiques, etc.
On passe maintenant aux variables continues et on étudiera leur relation avec le taux de crédit en utilisant des scatter plots:
continuous_vars_names = ['LoanOriginalAmount', 'CreditScoreRangeLower', 'TradesNeverDelinquent', 'DebtToIncomeRatio', 'StatedMonthlyIncome', 'MonthlyLoanPayment']
fig, axs = plt.subplots(2,3)
fig.set_figheight(10)
fig.set_figwidth(20)
i,j = 0,0
for name in continuous_vars_names:
if j>2 :
i,j = 1,0
axs[i,j].scatter(data=df_loan_status, x=name, y = 'BorrowerAPR', alpha=1/5)
axs[i,j].set_title(name)
j = j+1
plt.suptitle("Taux de crédit vs les variables continues")
plt.show()
continuous_vars_names.append("BorrowerAPR")
df_interval_vars = df_loan_status[continuous_vars_names]
df_interval_vars.corr().iloc[-1, :]
LoanOriginalAmount -0.236796 CreditScoreRangeLower -0.493453 TradesNeverDelinquent -0.222926 DebtToIncomeRatio 0.107052 StatedMonthlyIncome -0.114694 MonthlyLoanPayment -0.137907 BorrowerAPR 1.000000 Name: BorrowerAPR, dtype: float64
La correlation linéaire entre les variables et le taux d'intérêt est faible.
# Credit grade vs Income range
plt.figure(figsize=(20,10))
g = sns.FacetGrid(data=df_loan_status, col='IncomeRange',col_wrap=3, height= 4)
g.map(sns.countplot,"CreditGrade")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Income range vs Credit grade")
plt.show()
/home/wissal/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py:670: UserWarning: Using the countplot function without specifying `order` is likely to produce an incorrect plot. warnings.warn(warning)
<Figure size 2000x1000 with 0 Axes>
D'après le graphique précedent on remarque que la proportion des notes les plus faibles diminue avec la croissance du salaire
df_interval_vars = df_interval_vars.drop('BorrowerAPR', axis = 1)
# graphe pour matrice de correlation
plt.figure(figsize = [8, 5])
sns.heatmap(df_interval_vars.corr(), annot = True, fmt = '.3f',
cmap = 'vlag_r', center = 0)
plt.title("Correlation entre les variables continues")
plt.show()
plt.figure()
g = sns.PairGrid(data = df_interval_vars.sample(1000, replace=False))
g.map_offdiag(plt.scatter, alpha = 1/5)
g.map_diag(plt.hist)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Matrice de corrélation entre les variables continues")
plt.show()
<Figure size 640x480 with 0 Axes>
On a clairement une forte corrélation linéaire entre MonthlyLoanPayment et LoanOriginalAmount, sinon pour le reste des variables la corrélation est très faible.
Etudions maintenant la relation entre Term et les variables continues:
fig, axs = plt.subplots(2,3)
fig.set_figheight(10)
fig.set_figwidth(20)
# remove BorrowerAPR from the list
continuous_vars_names.pop()
i,j = 0,0
for name in continuous_vars_names:
if j>2 :
i,j = 1,0
sns.boxplot(data=df_loan_status, x='Term', y = name, ax = axs[i,j])
j = j+1
plt.suptitle("Durée du crédit vs varaibles continues")
plt.show()
On remarque que plus le montant est grand plus la période du crédit est longue, mais pour les autres variables on ne voit pas une croissance ou décroissance selon la période du crédit.
Avant de commencer à travailler sur LoanStatus, il va falloir équilibrer le nombre des échantillons de chaque statut pour ne pas biaiser les résultats.
df_loan_completed = df_loan_status.query('LoanStatus == "Completed"')
df_loan_defaulted= df_loan_status.query('LoanStatus == "Defaulted"')
df_loan_chargedoff = df_loan_status.query('LoanStatus == "Chargedoff"')
print(f"Completed : {len(df_loan_completed)} , Defaulted : {len(df_loan_defaulted)}, Chargedoff : {len(df_loan_chargedoff)}")
size = min(len(df_loan_completed), len(df_loan_defaulted), len(df_loan_chargedoff))
df_loan_completed = df_loan_completed.sample(size, replace=False, random_state=0)
df_loan_chargedoff = df_loan_chargedoff.sample(size, replace=False, random_state=0)
#test
assert df_loan_completed.shape[0] == df_loan_chargedoff.shape[0] == df_loan_defaulted.shape[0]
df_loan_eq_status = pd.concat([df_loan_chargedoff, df_loan_completed, df_loan_defaulted], axis=0)
# test
assert df_loan_eq_status.shape[0] == 3*size
df_loan_eq_status.sample(5)
Completed : 30722 , Defaulted : 3021, Chargedoff : 9328
| CreditGrade | Occupation | EmploymentStatus | IncomeRange | LoanOriginationDate | MemberKey | Term | ListingCategory | CreditScoreRangeLower | CreditScoreRangeUpper | TradesNeverDelinquent | DebtToIncomeRatio | StatedMonthlyIncome | LoanOriginalAmount | MonthlyLoanPayment | LoanStatus | BorrowerAPR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 40317 | NR | Nurse (RN) | Employed | 100,000+ | 2011-05-20 | 49093508505325401B5E518 | 60 | 1 | 760.0 | 779.0 | 0.98 | 0.18 | 10833.333333 | 15000 | 476.04 | Completed | 0.31420 |
| 104384 | NR | Other | Not available | 1-24,999 | 2012-01-09 | 6D5735355750710600EF664 | 36 | 7 | 660.0 | 679.0 | 0.93 | 0.86 | 1833.333333 | 4000 | 172.60 | Chargedoff | 0.35285 |
| 17150 | NR | Administrative Assistant | Employed | 50,000-74,999 | 2013-09-18 | B5E13692263463136E36028 | 36 | 1 | 680.0 | 699.0 | 0.95 | 0.17 | 5666.666667 | 12000 | 396.23 | Completed | 0.14409 |
| 58514 | NR | Social Worker | Employed | 50,000-74,999 | 2011-04-06 | 441F3511716750386A41C68 | 36 | 1 | 680.0 | 699.0 | 1.00 | 0.34 | 4750.000000 | 5000 | 201.43 | Completed | 0.29510 |
| 44267 | NR | Sales - Commission | Employed | 75,000-99,999 | 2011-03-28 | 44A935093080394464B8879 | 36 | 1 | 760.0 | 779.0 | 0.96 | 0.11 | 6666.666667 | 7000 | 214.70 | Chargedoff | 0.06888 |
# CreditGrade EmploymentStatus IncomeRange
fig, axs = plt.subplots(1,2)
fig.set_figwidth(20)
sns.countplot(data = df_loan_eq_status, x = 'CreditGrade', hue = 'LoanStatus', ax = axs[0]).set(title="Statut du crédit vs Credit grade")
sns.countplot(data = df_loan_eq_status, x = 'EmploymentStatus', hue = 'LoanStatus', ax = axs[1]).set(title="Statut du crédit vs statut d'emploiement")
plt.figure(figsize=(20,5))
# Pour vérifier si la proportion de completed augmente avec le salaire annuel
# on prendra des proportions égales
df_income_1 = df_loan_eq_status.query("IncomeRange == '1-24,999'")
df_income_25 = df_loan_eq_status.query("IncomeRange == '25,000-49,999'")
df_income_50 = df_loan_eq_status.query("IncomeRange == '50,000-74,999'")
df_income_75 = df_loan_eq_status.query("IncomeRange == '75,000-99,999'")
df_income_100 = df_loan_eq_status.query("IncomeRange == '100,000+'")
print(len(df_income_1), len(df_income_25), len(df_income_50), len(df_income_75), len(df_income_100))
size = min(len(df_income_1), len(df_income_25), len(df_income_50), len(df_income_75), len(df_income_100))
df_income_25 = df_income_25.sample(size, replace=False, random_state=0)
df_income_50 = df_income_50.sample(size, replace=False, random_state=0)
df_income_75 = df_income_75.sample(size, replace=False, random_state=0)
df_income_100 = df_income_100.sample(size, replace=False, random_state=0)
df_income_status = pd.concat([df_income_1, df_income_25, df_income_50, df_income_75, df_income_100])
# Test
assert df_income_status.shape[0] == 5*size
sns.countplot(data = df_income_status, x = 'IncomeRange', hue = 'LoanStatus').set(title="Statut du crédit vs Income range")
plt.legend(loc = 'upper left')
plt.show()
880 3495 2547 1108 993
CreditGrade à part la meilleur note AA, la proportion des crédit avec le statut Defaulted depasse la proportion des crédits avec le statut Completed, mais pour confirmer qu'effectivement le fait d'avoir une meilleur note n'augmente pas la probabilité de payer le crédit à temps il va falloir effectuer un test statistique (chi square). (Hypothèse à tester : il n y a aucune relation entre la note de crédit du créditeur et le statut du crédit)EmploymentStatus , au niveau de full-time la proportion du statut Defaulted est supérieur à Completed mais en général le statut Completed ne se distingue pas vraiment des proportions de Defaulted ou Chargedoff.Completed augmente avec l'augmentation des salaires. (Hypothèse à tester)# Période de cérdit vs statut du crédit
plt.figure()
sns.countplot(data = df_loan_eq_status, x = 'Term', hue = 'LoanStatus')
plt.legend(loc = 'upper left')
plt.title("Statut du crédit vs durée du crédit")
plt.show()
Il apparait qu'il n y'a aucune relation entre les deux.
fig, axs = plt.subplots(2,3)
fig.set_figwidth(20)
fig.set_figheight(8)
i,j = 0,0
for name in df_interval_vars.columns:
if j>2 : i,j=1,0
sns.boxplot(data = df_loan_eq_status, x = 'LoanStatus', y = name, ax = axs[i,j])
j = j+1
plt.suptitle("Statut du crédit vs variables continues")
plt.show()
On remarque que les mediannes du statut Completed pour les variables CreditScore et TradesNeverDelinquent est un peu elevé par rapport aux autres statuts. Il va falloir éffectuer un test ANOVA pour s'assurer que les moyennes des trois groupes sont différentes au niveau de deux variables.
Pour le taux de crédit on remarque ce qui suit :
- une décroissance du taux pour les meilleurs notes de crédit, on a aussi un taux faible au niveau du statut d'emploiement
Self Employedpar rapport aux autres statuts.- En analysant son évolution par rapport aux années d'initiation du crédit on constate que celui ci est aussi impacté par le nombre d'offre et demandes des crédits.
- On a remarqué aussi que pour les durée de crédits plus longue le taux est plus elevé.
- Les taux de crédit varient aussi par rapport à la catégorie de celui ci. Pour le statut du crédit on remarque ce qui suit:
- La proportion du statut
Cpompletedaugmente avec la salaire annuel.- Les mediannes des variables
CreditScoreetTradesNeverDelinquentsont plus elevé pour le statutCompleted. Si ce résultat est statistiquement significative cela signifiera qu'il y'a de grandes chances que le crédit aie le statut Completed pour les meilleurs notes et quand le pourcentage des crédits en retard du créditeur est très faible.
- On a une relation entre
CreditGradeetIncomeRange, tel que la proportion des notes les plus faibles diminue quand le salaire annuel augmente.- Il existe aussi une corrélation linéaire positive entre la durée et le montant du crédit,
Commençons par étudier la relation entre le taux d'intérêt, le statut d'emploiement et le salaire pour voir si on peu expliquer la différence entre les taux de crédit selon le statut d'emploiement plus en détail:
g = sns.FacetGrid(data = df_loan_status, col='EmploymentStatus', col_wrap=3, height =5)
g.map(sns.boxplot, 'IncomeRange', 'BorrowerAPR')
g.set_xticklabels(rotation=90)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Taux d'intérêt par statut d'emploiement et salaire")
plt.show()
/home/wissal/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py:670: UserWarning: Using the boxplot function without specifying `order` is likely to produce an incorrect plot. warnings.warn(warning)
Là on voit clairement qu'il y'a une decroissance du taux d'intérêt avec l'augmentation du salaire dans le cas des employé (full-time, part-time) et retraité, mais ce qu'on remarque de plus est que dans le cas des entrepreneur un salaire annuel assez élévé n'implique pas forcément un taux de crédit faible.
On suspecte aussi qu'un créditeur avec un salaire élevé qui prend un montant de crédit faible aura un taux de crédit réduit, vérifions ce cas :
# est ce qu'un salaire élévé avec un montant de crédit faible => taux d'intérêt faible?
g = sns.FacetGrid(data=df_loan_status, col = 'IncomeRange', col_wrap=3, height=5)
g.map(plt.scatter, 'LoanOriginalAmount', 'BorrowerAPR')
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Taux d'intérêt par montant de crédit et salaire")
plt.xlim(right=10000)
plt.show()
Apparement, on n'a pas cette relation, peut être qu'il y a d'autres facteurs qui entrent en relation. essayons encore en ajoutant CreditScore:
def facet_scatter(x, y, c, **kwargs):
kwargs.pop("color")
plt.scatter(x, y, c=c, **kwargs)
df_loan_status['LoanAmountLogScale'] = np.log10(df_loan_status['LoanOriginalAmount'])
g = sns.FacetGrid(data=df_loan_status, col = 'IncomeRange', col_wrap=3, height=5)
g.map(facet_scatter, 'LoanAmountLogScale', 'BorrowerAPR', 'CreditScoreRangeLower', cmap='vlag_r')
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Taux d'intérêt par montant de crédit, salaire et score de créditeur")
xticks = 10**np.linspace(df_loan_status['LoanAmountLogScale'].min(), df_loan_status['LoanAmountLogScale'].max(), 11)
label_format = '{:,.0f}'
for ax in g.axes.flat:
ax.set_xticklabels([label_format.format(xtick) for xtick in xticks], rotation=90)
plt.colorbar()
plt.show()
/tmp/ipykernel_4433/643556429.py:13: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels([label_format.format(xtick) for xtick in xticks], rotation=90)
On remarque qu'il y a une croissance languissante du taux de crédit selon le montant pour le groupe où les scores sont élevé mais le degré de croissance ne varie pas selon les salaires.
Commençons maintenant à invetiguer les relations avec le statut du crédit:
Là on continuera d'abord par invetiguer la relation entre le montant, le salaire et le statut
plt.figure()
g = sns.FacetGrid(data=df_loan_eq_status, col='IncomeRange', col_wrap=4)
g.map(sns.pointplot, 'LoanStatus', 'LoanOriginalAmount')
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Statut du crédit par montant et salaire")
plt.show()
/home/wissal/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py:670: UserWarning: Using the pointplot function without specifying `order` is likely to produce an incorrect plot. warnings.warn(warning)
<Figure size 640x480 with 0 Axes>
Alors, là on remarque dèja que les gens avec des salaires élevé prennent des montants de crédit plus importants, et on voit aussi que pour chaque marge de salaire les crédits avec statut Completed ont des montants plus faibles par rapport aux montant des crédits Defaulted ou Chargedoff. Donc on peut dire que plus le salaire annuel depasse largement le montant du crédit plus on a de chance d'avoir un crédit avec le statut Completed.
Essayons maintenant d'étudier la relation entre les variables (TradesNeverDelinquent, IncomeRange, DebtToIncomeRatio):
# scatter plot avec facetage
plt.figure()
g = sns.FacetGrid(data=df_loan_status.sample(3000, replace=False, random_state=0), col = 'IncomeRange', col_wrap=3)
g.map(sns.regplot, 'TradesNeverDelinquent', 'DebtToIncomeRatio', scatter_kws = {'alpha':1/5})
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("DebtToIncomeRatio vs IncomeRange vs TradesNeverDelinquent")
plt.ylim(top=4)
plt.show()
<Figure size 640x480 with 0 Axes>
On remarque ici que plus le salaire est élévé le nuage de point devient plus concentré dans la partie droite inférieur, ce qui dit que plus le salaire est élevé plus le quotiont dette/revenu est faible et le pourcentage des comptes de crédits où il n y a eu aucun retard est elevé.
g = sns.FacetGrid(data=df_loan_status, col = 'IncomeRange', col_wrap=5)
g.map(sns.pointplot, 'CreditGrade', 'DebtToIncomeRatio')
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("DebtToIncomeRatio vs IncomeRange vs CreditGrade")
plt.ylim(top=0.8)
plt.show()
/home/wissal/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py:670: UserWarning: Using the pointplot function without specifying `order` is likely to produce an incorrect plot. warnings.warn(warning)
Pour les salaires inférieur à 25000 les créditeus avec des meilleurs notes n'ont pas forcement un quotient dette/revenu faible, alors que pour les salaires qui dépassent 50000 on voit une décroissance de DebtToIncomeRatio au niveau des trois notes les plus élevé.
Avant on avait trouvé que la proportion de Defaulted dépassait la proportion de Completed pour le type d'emploiement full-time et on se demande si ceci peut être du au montant du crédit:
plt.figure(figsize=(15,6))
sns.pointplot(data=df_loan_eq_status, x = 'EmploymentStatus', y = 'LoanOriginalAmount', hue='LoanStatus', dodge=0.2, palette = 'Greens', join=False)
plt.title("Statut du crédit par montant du crédit et statut d'emploiement")
plt.show()
On remarque que dans la plus part des cas (à part le statut Employed qui regroupe d'autres catégories) les crédits qui ont le statut Completed ont des montants plus faible que les montants des autres statuts.
- Après avoir effectué une analyse bivarié entre le taux d'intérêt et le statut d'emploiement on avait remarqué que pour
Self Employedle taux était le plus faible, alors que quand on a encore analysé cette relation avec le salaire annuel on s'apperçoit qu'on n'a pas une décroissance du taux d'intérêt avec la croissance du salaire (ce qui n'est pas le cas pour les autres statuts d'emploiement) et que la valeur du taux ne varie pas trop entre un salaire faible et un salaire élevé.- Etant motivé par l'idée que les gens bien payé qui prennent des mantants de crédit faibles augmentera la probabilité d'avoir le statut
Completed, on a investigué la relation entre salaire/montant/statut du crédit. Après cet investigation on s'est apperçu que les gens avec des salaires élevé prennent des montants de crédit plus importants, et on voit aussi que pour chaque marge de salaire les crédits avec statutCompletedont des montants plus faibles par rapport aux montant des créditsDefaultedouChargedoff. Donc on peut faire une hypothèse dans ce sens (plus le salaire annuel dépasse largement le montant du crédit plus on a de chance d'avoir un crédit avec le statutCompleted).
- Au niveau d'interactions entre les variables, on a commencé par investiguer la relation entre les variables suivantes : (
DebtToIncomeRatio,TradesneverDelinquentetIncomeRange), on s'aperçoit alors que plus le salaire est élevé plus le quotiont dette/revenu est faible et le pourcentage des comptes de crédits où il n y a eu aucun retard est élevé.- En ce qui concerne la relation entre (
CreditGrade,DebtToIncomeRatioetIncomeRange) on remarque que pour les salaires inférieur à 25000 les créditeus avec des meilleurs notes n'ont pas forcément un quotient dette/revenu faible, alors que pour les salaires qui dépassent 50000 on voit une décroissance deDebtToIncomeRatioau niveau des trois notes les plus élevé.
Lors de l'exploration, on a trouvé que le taux d'intérêt est influencé par la note du créditeur, le salaire et la différence entre offre et demande des crédits, en plus les entrepreneurs ont des taux d'intérêt plus faibles par rapport aux autres statuts d'emploiement et que celui ci n'est pas impacté par leur revenu. En ce qui concerne le statut du crédit, on remarque que la proportion de celui ci augmente avec le niveau du salaire, et que plus celui ci dépasse largement le montant du crédit plus la probabilité d'observer le statut
Completedest élevé, on s'apperçoit aussi que quelques paramètres du profil du créditeur comme le score de créditeur et le pourcentage des crédits en retard entrent aussi en relation pour détérminer le statut final du crédit.